In [1]:
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from string import punctuation

In [2]:
import re

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score,accuracy_score

In [3]:
data=pd.read_csv('Train.csv')
data.head()

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [4]:
print(data['label'].value_counts())


0    20019
1    19981
Name: label, dtype: int64


In [5]:
pos_rev=data[data['label']==1][:5000]
neg_rev=data[data['label']==0][:5000]

new_data=pd.concat([pos_rev,neg_rev],axis=0)


In [6]:
final_data=new_data.sample(frac=1)
final_data.head()


Unnamed: 0,text,label
9575,There are so many more complexities to the plo...,1
8786,"""Ahh...I didn't order no amazing hit show"".......",1
9355,RKS films always have been commercial films wh...,0
7708,Here in Brazil is very rare to see a good Braz...,1
879,This is a gem. As a Film Four production - the...,1


In [7]:
final_data['label'].value_counts()


1    5000
0    5000
Name: label, dtype: int64

In [8]:
data=final_data['text']
useless_words=list(stopwords.words('english')+list(punctuation)+['<br />'])
lemmatizer=WordNetLemmatizer()
corpus=data.tolist()
corpus

["There are so many more complexities to the plot of this wonderful thought provoking movie than just infidelity and cover-up of responsibility for the accident. I was struck by the initial seeming goodness of husband Wilkinson who wanted the driver, when he thought it was Everett, disclosed to the police, and the change of heart (and morals) when he learned it was his wife. As well, was he indeed good, and/or was he attempting to redeem self by allowing her to go with Rupert. Then, things switchedand SHE decided the right ting to do was admit that she hit him. Most importantly the theme of redemption (for the accident - for the infidelity - in her own odd, flawed way)is strongest in Watson's sickbed care for Everett. I believe that is why she undertook that effort.<br /><br />This seems to be a common theme in modern British novels: Brideshead Revisited, The End of the Affair come to mind. Love it.",
 '"Ahh...I didn\'t order no amazing hit show"....."We\'ll you got one" Hack is simply

In [9]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
final_corpus=[]
    
for i in range(len(corpus)):
    word=word_tokenize(corpus[i].lower())
    word=[
        lemmatizer.lemmatize(y) for y in word if y not in useless_words
    ]
    j=" ".join(word)
    j=re.sub(r'[^\w\s]', " ", j)
    j=re.sub("(^|\W)\d+"," ",j)
    
    final_corpus.append(j)

In [11]:
label_=final_data['label']
train_df={
    'label':label_,
    'content':final_corpus,
}

train_df=pd.DataFrame(train_df)
train_df.head()

Unnamed: 0,label,content
9575,1,many complexity plot wonderful thought provoki...
8786,1,ahh n t order amazing hit show ...
9355,0,rks film always commercial film suited s gha...
7708,1,brazil rare see good brazilian film brant s ne...
879,1,gem film four production anticipated quality i...


In [12]:
tf=TfidfVectorizer()
vector=tf.fit_transform(train_df['content'])
x=vector.toarray()
y=train_df['label']
vector

<10000x47229 sparse matrix of type '<class 'numpy.float64'>'
	with 965918 stored elements in Compressed Sparse Row format>

In [13]:
X_train,X_test,Y_train,Y_test=train_test_split(x,y,test_size=0.2,random_state=101)

In [14]:
model_names=['Logistic Regression','Naive Bayes']
model=[LogisticRegression(),MultinomialNB()]

def get_results(model_names,model,x_train,x_test,y_train,y_test):
    
    
    model_accuracy_score=[]
    model_auc_score=[]
    
    for i in model:
        model_=i.fit(x_train,y_train)
        y_preds=model_.predict(x_test)
        
        score1=accuracy_score(y_test,y_preds)
        score2=roc_auc_score(y_test,y_preds,)
        
        model_accuracy_score.append(score1)
        model_auc_score.append(score2)
    
    result={
        'Model':model_names,
        'Accuracy Score': model_accuracy_score
    }
    
    result=pd.DataFrame(result)
    
    return result

In [15]:
new_df=get_results(model_names,model,X_train,X_test,Y_train,Y_test)

In [16]:
new_df


Unnamed: 0,Model,Accuracy Score
0,Logistic Regression,0.8795
1,Naive Bayes,0.857


In [17]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove stop words and punctuation
    stop_words = set(stopwords.words('english'))
    tokens = [word.lower() for word in tokens if word.isalnum() and word.lower() not in stop_words]
    
    # Join the tokens back into a string
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text

[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
from sklearn.feature_extraction.text import CountVectorizer

# Example usage
corpus = ["This is the first document.", "This document is the second document."]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

# X contains the feature vectors for each document
print(X.toarray())

[[1 1 1 0 1 1]
 [2 0 1 1 1 1]]
