In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix


In [2]:
amazon_df=pd.read_csv('Amazon_Reviews.csv')
amazon_df

Unnamed: 0,Review,Label
0,Stuning even for the non-gamer: This sound tr...,1
1,The best soundtrack ever to anything.: I'm re...,1
2,Amazing!: This soundtrack is my favorite musi...,1
3,Excellent Soundtrack: I truly like this sound...,1
4,"Remember, Pull Your Jaw Off The Floor After H...",1
...,...,...
194,A Book That Is Worth a Second Look: This book...,1
195,Best game ever: This games makes even amazing...,1
196,Guitar in Absentia: With all due respect to a...,0
197,Stiff and Smells like drying paint: You get w...,0


In [3]:
y=amazon_df['Label']
y

0      1
1      1
2      1
3      1
4      1
      ..
194    1
195    1
196    0
197    0
198    0
Name: Label, Length: 199, dtype: int64

In [4]:
amazon_df.drop(columns='Label',inplace=True)



In [5]:
amazon_df

Unnamed: 0,Review
0,Stuning even for the non-gamer: This sound tr...
1,The best soundtrack ever to anything.: I'm re...
2,Amazing!: This soundtrack is my favorite musi...
3,Excellent Soundtrack: I truly like this sound...
4,"Remember, Pull Your Jaw Off The Floor After H..."
...,...
194,A Book That Is Worth a Second Look: This book...
195,Best game ever: This games makes even amazing...
196,Guitar in Absentia: With all due respect to a...
197,Stiff and Smells like drying paint: You get w...


In [6]:
#train test split
X_train,X_test,y_train,y_test=train_test_split(amazon_df,y,test_size=0.2,random_state=40)
X_train['Review']


86      It Drew Air: This book drew air pretty bad. I...
182     It's a knock off.: The official Chrono Trigge...
69      Rapunzel: This is such a great game both my 3...
125     Delicious and Addictive: This cereal is great...
42      Alaska sourdough: REad most of the book while...
                             ...                        
50      Kitchen TV: I bought this to replace a 13" tu...
184     It was "ok" but I won't watch it again: I wat...
165     They're blocks - keep that in mind: We got th...
7       Glorious story: I loved Whisper of the wicked...
70      [Not what I expected]: ... My daughter and I ...
Name: Review, Length: 159, dtype: object

In [7]:
regexp=RegexpTokenizer(r'\w+')
stopwords_en=stopwords.words('english')
stopwords_en

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [8]:
lemmatizer=WordNetLemmatizer()
tf=TfidfVectorizer()

In [9]:
def preprocessing(review):
    tokens=regexp.tokenize(review)
    pure_tokens=[token.lower() for token in tokens if token.lower() not in stopwords_en]
    lemma_tokens=[lemmatizer.lemmatize(pure_token, pos='v') for pure_token in pure_tokens]
    return ' '.join(lemma_tokens)
    
    
    

In [10]:
X_train['Review']=X_train['Review'].apply(preprocessing)

X_test['Review']=X_test['Review'].apply(preprocessing)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train['Review']=X_train['Review'].apply(preprocessing)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['Review']=X_test['Review'].apply(preprocessing)


In [11]:
X_train_tfidf=tf.fit_transform(X_train['Review'])


X_test_tfidf=tf.transform(X_test['Review'])

In [12]:
train_df=tf.fit_transform(X_train['Review'])

In [13]:
test_df=tf.transform(X_test['Review'])

In [17]:
logreg=LogisticRegression()
logreg.fit(X_train_tfidf,y_train)

logreg_pred=logreg.predict(X_test_tfidf)

In [23]:
confusion_matrix(y_test,logreg_pred)

array([[ 6, 16],
       [ 0, 18]], dtype=int64)