## Download Data From https://archive.ics.uci.edu/ml/machine-learning-databases/00331/

### Create and structure the data to make it ready for Processing

In [1]:
import pandas as pd
import numpy as np

In [3]:
yelp_data=pd.read_csv('yelp_labelled.txt',sep='\t',header=None)

In [5]:
yelp_data.head()

Unnamed: 0,0,1
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [7]:
column_name=['Review','Sentiment']

In [9]:
yelp_data.columns= column_name

In [11]:
yelp_data.head()

Unnamed: 0,Review,Sentiment
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [13]:
yelp_data.shape

(1000, 2)

In [15]:
amazon_data=pd.read_csv('amazon_cells_labelled.txt',sep='\t',header=None)

In [17]:
amazon_data.head()

Unnamed: 0,0,1
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [19]:
amazon_data.columns=column_name

In [21]:
amazon_data.head()

Unnamed: 0,Review,Sentiment
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [23]:
amazon_data.shape

(1000, 2)

In [25]:
imdb_data=pd.read_csv('imdb_labelled.txt',sep='\t',header=None)

In [27]:
imdb_data.head()

Unnamed: 0,0,1
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [29]:
imdb_data.columns=column_name

In [31]:
imdb_data.head()

Unnamed: 0,Review,Sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [33]:
imdb_data.shape

(748, 2)

In [35]:
type(yelp_data)

pandas.core.frame.DataFrame

In [37]:
# Append does not work with newer version of pandas
#data= yelp_data.append([amazon_data,imdb_data],ignore_index=True)


In [39]:
data= pd.concat([yelp_data, imdb_data,amazon_data], ignore_index=True)

In [42]:
data.shape

(2748, 2)

In [44]:
data.head()

Unnamed: 0,Review,Sentiment
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [46]:
# frac= fraction of axis where 1 means whole
data= data.sample(frac=1,random_state=42)

In [48]:
data.head()

Unnamed: 0,Review,Sentiment
2516,Great product.,1
2642,This product is very High quality Chinese CRAP...,0
1359,"Let's start with all the problemsthe acting, ...",0
1702,It's too bad that everyone else involved didn'...,0
2660,It always cuts out and makes a beep beep beep ...,0


In [50]:
data.to_csv('sentiment.csv', index=False)


In [24]:
len(data)

2748

In [25]:
data.Sentiment.value_counts().head(1)/len(data.Sentiment)

Sentiment
1    0.504367
Name: count, dtype: float64

In [26]:
data.isnull().sum()

Review       0
Sentiment    0
dtype: int64

In [30]:
X=data.Review
X.shape

(2748,)

In [27]:
data.head()

Unnamed: 0,Review,Sentiment
2516,Great product.,1
2642,This product is very High quality Chinese CRAP...,0
1359,"Let's start with all the problemsthe acting, ...",0
1702,It's too bad that everyone else involved didn'...,0
2660,It always cuts out and makes a beep beep beep ...,0


In [28]:
y=data.Sentiment

In [29]:

y.head()

2516    1
2642    0
1359    0
1702    0
2660    0
Name: Sentiment, dtype: int64

In [None]:
X.head()

In [31]:
import string

In [32]:
pun=string.punctuation

In [33]:
pun

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [34]:
import spacy
nlp=spacy.load('en_core_web_sm')


In [35]:
from spacy.lang.en.stop_words import STOP_WORDS
all_stop_words= list(STOP_WORDS)
from string import punctuation as pun

In [36]:
def preprocess_text(text):
  """Preprocesses text with lemmatization, stopword removal, and punctuation removal."""
  doc = nlp(text.lower())
  return [token.lemma_ for token in doc if not (token.is_stop or token.is_punct)]


def extract_lemmatized_words(text):
    # Process the text with spaCy
    doc = nlp(text)
    
    # Extract lemmatized words that are not stop words or punctuation
    lemmatized_words = [token.lemma_ for token in doc if token.lemma_ not in STOP_WORDS and not token.is_punct]
    
    return lemmatized_words



In [None]:
text= "Tom is at his happiest when his digestive system is working great, it makes tom  Joyful!! "

In [41]:
text

'Ritesh is at his happiest when his digestive system is working great, it makes ritesh very Joyful!! '

In [42]:
preprocess_text(text)

['ritesh',
 'happy',
 'digestive',
 'system',
 'work',
 'great',
 'make',
 'ritesh',
 'joyful']

In [None]:
#data['preprocessed_review']= data.Review.apply(preprocess_text)

In [None]:
#data.head()

In [None]:
#data['precossed_text']=pd.Series(data['precossed_text'])

In [None]:
type(data['precossed_text'])

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

In [44]:
# complete TFIDF happening on the processed text
tfidf= TfidfVectorizer(tokenizer=preprocess_text)

# Binary Classification from here on 

In [45]:
clf= LogisticRegression()

In [46]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data.Review, data.Sentiment, test_size=0.33, random_state=42)

In [47]:
# This is a pipeline
model = Pipeline([('tfidf',tfidf),('classifier',clf)])

In [48]:
model.fit(X_train,y_train)



In [49]:
y_pred= model.predict(X_test)

In [50]:
from sklearn.metrics import accuracy_score

In [51]:
accuracy_score(y_test,y_pred)

0.7850055126791621

## Making Predictions

In [52]:
model.predict(['This is a great movie'])

array([1])

In [53]:
model.predict(['This is the worst movie'])

array([0])

In [54]:
model.predict(['pathetic movie I have ever seen'])

array([0])

In [56]:
model.predict(['not good movie'])

array([1])

In [55]:
if 'not' in all_stop_words:
    print(True)

True
