Natural Language Processing Techniques for Text Classification

In [None]:
import pandas as pd
# Path of the IMDB reviews CSV that the rest of the notebook works on.
DATASET_PATH = '/content/IMDB Dataset.csv'

df = pd.read_csv(DATASET_PATH)
# Preview the first rows to confirm the file parsed as expected.
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# Print the frame's index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [None]:
# Per-column count of missing values (isna is the method isnull aliases).
df.isna().sum()

Unnamed: 0,0
review,0
sentiment,0


In [None]:
# Count rows that are exact repeats of an earlier row.
df.duplicated().sum()

418

In [None]:
# drop_duplicates() returns a new frame and leaves the original untouched, so
# the result has to be bound back to `df`; otherwise every later cell keeps
# working on the duplicated rows. The index is rebuilt so it stays contiguous
# after rows are removed.
df = df.drop_duplicates().reset_index(drop=True)
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [None]:
# List the distinct labels present in the sentiment column.
df['sentiment'].unique()

array(['positive', 'negative'], dtype=object)

In [None]:
# Count how many rows carry each sentiment label (class balance check).
df['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,25000
negative,25000


In [None]:
# Integer code assigned to each sentiment label.
Sentiment_classes = {'negative': 0, 'positive': 1}

# Series.map turns every value that is not a key of the mapping into NaN, so
# running this cell a second time on the already-encoded column would wipe
# out all labels. Encode only while the column is not yet purely 0/1.
if not df['sentiment'].isin(list(Sentiment_classes.values())).all():
    df['sentiment'] = df['sentiment'].map(Sentiment_classes)
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
...,...,...
49995,I thought this movie did a down right good job...,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0
49997,I am a Catholic taught in parochial elementary...,0
49998,I'm going to have to disagree with the previou...,0


In [None]:
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import wordnet
from nltk import pos_tag
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Download each NLTK resource in turn. The status returned by the final
# download is left as the cell's last expression so it is still displayed.
for resource_name in ('punkt', 'stopwords', 'wordnet', 'averaged_perceptron_tagger'):
    download_ok = nltk.download(resource_name)
download_ok

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
import re
def clean_text(text):
    """Strip markup and social-media noise from a piece of text.

    The following are removed: HTML tags such as ``<br />``, ``@mention``
    words, ``http://`` and ``https://`` URLs, and ``#`` characters. Runs of
    whitespace are then collapsed to single spaces, the ends are trimmed,
    and a trailing period is appended when the text does not already end
    with one.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string; it always ends with ``'.'``.
    """
    # Replace HTML tags with a space so the words on either side stay
    # separated. Requiring a letter after '<' keeps things like "<3" intact.
    text = re.sub(r'</?[A-Za-z][^<>]*>', ' ', text)
    # Remove words starting with '@'
    text = re.sub(r'@\w+', '', text)
    # Remove URLs, both plain http and https
    text = re.sub(r'https?://\S+', '', text)
    # Remove hash signs (the word after the '#' is kept)
    text = text.replace('#', '')
    # Collapse whitespace last, so the gaps left by the removals above are
    # squeezed out as well
    text = re.sub(r'\s+', ' ', text).strip()
    # Ensure the text ends with a period
    if not text.endswith('.'):
        text += '.'
    return text
# Run clean_text over every raw review and store the result next to it.
cleaned_reviews = df['review'].apply(clean_text)
df['Clean_review'] = cleaned_reviews
df

Unnamed: 0,review,sentiment,Clean_review
0,One of the other reviewers has mentioned that ...,1,One of the other reviewers has mentioned that ...
1,A wonderful little production. <br /><br />The...,1,A wonderful little production. <br /><br />The...
2,I thought this was a wonderful way to spend ti...,1,I thought this was a wonderful way to spend ti...
3,Basically there's a family where a little boy ...,0,Basically there's a family where a little boy ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,"Petter Mattei's ""Love in the Time of Money"" is..."
...,...,...,...
49995,I thought this movie did a down right good job...,1,I thought this movie did a down right good job...
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0,"Bad plot, bad dialogue, bad acting, idiotic di..."
49997,I am a Catholic taught in parochial elementary...,0,I am a Catholic taught in parochial elementary...
49998,I'm going to have to disagree with the previou...,0,I'm going to have to disagree with the previou...


In [None]:
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Stopword set and lemmatizer shared by every preprocess_text call. Filled
# lazily on first use so the NLTK corpus is read once instead of per review.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the cached ``(stop_words, lemmatizer)`` pair, building it once."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return (
        _PREPROCESS_RESOURCES['stop_words'],
        _PREPROCESS_RESOURCES['lemmatizer'],
    )


def preprocess_text(text):
    """Normalize a review into lemmatized, stopword-free tokens.

    The text is lowercased, passed through ``clean_text``, tokenized with
    ``word_tokenize``, filtered down to alphanumeric tokens that are not
    English stopwords, and each remaining token is lemmatized.

    Args:
        text: The string to preprocess.

    Returns:
        A ``(text, tokens)`` tuple: ``tokens`` is the list of processed
        tokens and ``text`` is those tokens joined with single spaces.
    """
    stop_words, lemmatizer = _get_preprocess_resources()
    # Lowercase before cleaning so the stopword lookup below matches
    text = clean_text(text.lower())
    # Tokenize the text
    tokens = word_tokenize(text)
    # Keep alphanumeric, non-stopword tokens and lemmatize them
    tokens = [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word.isalnum() and word not in stop_words
    ]
    # Reconstruct the text from tokens
    return ' '.join(tokens), tokens
# preprocess_text returns a (text, tokens) pair, so every cell of this column
# holds that tuple rather than a plain string.
df['preprocess_text']=df['Clean_review'].apply(preprocess_text)
df

Unnamed: 0,review,sentiment,Clean_review,preprocess_text
0,One of the other reviewers has mentioned that ...,1,One of the other reviewers has mentioned that ...,(one reviewer mentioned watching 1 oz episode ...
1,A wonderful little production. <br /><br />The...,1,A wonderful little production. <br /><br />The...,(wonderful little production br br filming tec...
2,I thought this was a wonderful way to spend ti...,1,I thought this was a wonderful way to spend ti...,(thought wonderful way spend time hot summer w...
3,Basically there's a family where a little boy ...,0,Basically there's a family where a little boy ...,(basically family little boy jake think zombie...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,"Petter Mattei's ""Love in the Time of Money"" is...",(petter mattei love time money visually stunni...
...,...,...,...,...
49995,I thought this movie did a down right good job...,1,I thought this movie did a down right good job...,(thought movie right good job creative origina...
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0,"Bad plot, bad dialogue, bad acting, idiotic di...",(bad plot bad dialogue bad acting idiotic dire...
49997,I am a Catholic taught in parochial elementary...,0,I am a Catholic taught in parochial elementary...,(catholic taught parochial elementary school n...
49998,I'm going to have to disagree with the previou...,0,I'm going to have to disagree with the previou...,(going disagree previous comment side maltin o...


In [None]:
# Each 'preprocess_text' entry is a (text, tokens) pair; keep the token list
# (second element) in its own column.
token_lists = df['preprocess_text'].apply(lambda pair: pair[1])
df['tokens'] = token_lists
df

Unnamed: 0,review,sentiment,Clean_review,preprocess_text,tokens
0,One of the other reviewers has mentioned that ...,1,One of the other reviewers has mentioned that ...,(one reviewer mentioned watching 1 oz episode ...,"[one, reviewer, mentioned, watching, 1, oz, ep..."
1,A wonderful little production. <br /><br />The...,1,A wonderful little production. <br /><br />The...,(wonderful little production br br filming tec...,"[wonderful, little, production, br, br, filmin..."
2,I thought this was a wonderful way to spend ti...,1,I thought this was a wonderful way to spend ti...,(thought wonderful way spend time hot summer w...,"[thought, wonderful, way, spend, time, hot, su..."
3,Basically there's a family where a little boy ...,0,Basically there's a family where a little boy ...,(basically family little boy jake think zombie...,"[basically, family, little, boy, jake, think, ..."
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,"Petter Mattei's ""Love in the Time of Money"" is...",(petter mattei love time money visually stunni...,"[petter, mattei, love, time, money, visually, ..."
...,...,...,...,...,...
49995,I thought this movie did a down right good job...,1,I thought this movie did a down right good job...,(thought movie right good job creative origina...,"[thought, movie, right, good, job, creative, o..."
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0,"Bad plot, bad dialogue, bad acting, idiotic di...",(bad plot bad dialogue bad acting idiotic dire...,"[bad, plot, bad, dialogue, bad, acting, idioti..."
49997,I am a Catholic taught in parochial elementary...,0,I am a Catholic taught in parochial elementary...,(catholic taught parochial elementary school n...,"[catholic, taught, parochial, elementary, scho..."
49998,I'm going to have to disagree with the previou...,0,I'm going to have to disagree with the previou...,(going disagree previous comment side maltin o...,"[going, disagree, previous, comment, side, mal..."
