In [1]:
import string
from functools import lru_cache

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

In [2]:
# Read the labelled review corpus; the path is resolved relative to the
# current working directory.
dataframe = pd.read_csv(filepath_or_buffer='fake reviews dataset.csv')
# Show the first five rows as a quick sanity check of the loaded columns.
dataframe.head(n=5)

Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor..."
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I..."
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it i..."
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the s...


In [3]:
# Remove the 'Unnamed: 0' column when it is present; errors='ignore' makes the
# call a no-op when the column does not exist, so the cell is safe to re-run.
dataframe.drop(columns='Unnamed: 0', errors='ignore', inplace=True)

In [4]:
# Re-display the first five rows after the column clean-up.
dataframe.head(n=5)

Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor..."
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I..."
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it i..."
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the s...


In [5]:
# Discard every row that has at least one missing value.
dataframe.dropna(axis=0, inplace=True)
# Character count of each review.
dataframe['length'] = dataframe['text_'].map(len)
# Display the text of the longest review whose label is 'OR'.
(
    dataframe.loc[dataframe['label'] == 'OR', ['text_', 'length']]
    .sort_values(by='length', ascending=False)
    .head()
    .iloc[0]
    .text_
)

'WEAK ON CURRENT SCIENCE.\nAfter seeing it twice, I agree with much (but not all) of the positive five star reviews. Out of respect for those who READ reviews, I\'ll not repeat everything that I like about the presentation. I found the goofy oversize earrings, hairdo, and facial hair arrangement of Daniel Vitalis, (described as a "Wild Food Expert") distracting. UGH. Ditto for David Wolfe, who had an extremely goofy wild hairdo. On the other hand, Jon Gabriel, described as an "author and weight loss expert" was nicely groomed and a good presenter. His story of personal transformation of a fellow of over 400 pounds (whew) to becoming a jock of normal weight was inspiring. Christiane Northrup preserves her rank as one of America\'s cutest doctors. A really nice looking woman! Presentations by Dr. Mercola, Jason Vale, Kris Carr, Alejandro Junger were fine. It was disappointing to have Jamie Oliver (so popular in the UK) give Baby Cow Growth Fluid a pass with unscientific but popular ideas

In [6]:
# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


@lru_cache(maxsize=1)
def _english_stopwords():
    """Return NLTK's English stop-word list as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


def convertmyTxt(rv):
    """Tokenise one review for use as a ``CountVectorizer`` analyzer.

    Punctuation characters (``string.punctuation``) are removed, the text is
    split on whitespace, and tokens whose lower-cased form is an English stop
    word are dropped. Surviving tokens keep their original casing.

    Args:
        rv: The review text (a ``str``).

    Returns:
        list[str]: The remaining tokens, in their original order.
    """
    # The stop-word list is fetched once and held as a frozenset; calling
    # stopwords.words('english') for every token (as this function used to)
    # rebuilds the whole list each time and dominates training time.
    stop_words = _english_stopwords()
    without_punctuation = rv.translate(_PUNCTUATION_TABLE)
    return [w for w in without_punctuation.split() if w.lower() not in stop_words]

# Hold out 25% of the reviews for evaluation. A fixed random_state makes the
# split -- and therefore every accuracy figure computed from it -- repeatable
# across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    dataframe['text_'],
    dataframe['label'],
    test_size=0.25,
    random_state=42,
)

In [7]:
# Text-classification pipeline: bag-of-words counts (tokenised by
# convertmyTxt) -> TF-IDF weighting -> random forest.
# random_state is fixed so repeated fits on the same data give the same model.
pip = Pipeline(steps=[
    ('bow', CountVectorizer(analyzer=convertmyTxt)),
    ('tfidf', TfidfTransformer()),
    ('classifier', RandomForestClassifier(random_state=42)),
])

In [8]:
# Train the current pipeline on the training split.
pip.fit(X=x_train, y=y_train)

KeyboardInterrupt: 

In [9]:
# NOTE: despite its name, `randomForestClassifier` holds the labels predicted
# for x_test by the fitted pipeline, not the classifier object itself.
randomForestClassifier = pip.predict(X=x_test)
print(
    'Accuracy of the model:',
    str(np.round(accuracy_score(y_test, randomForestClassifier) * 100, 2)) + '%',
)

In [10]:
# Rebind `pip` to a new pipeline with the same vectorisation steps but a
# support-vector classifier as the final estimator.
pip = Pipeline(steps=[
    ('bow', CountVectorizer(analyzer=convertmyTxt)),
    ('tfidf', TfidfTransformer()),
    ('classifier', SVC()),
])

In [11]:
# Train the current pipeline on the training split.
pip.fit(X=x_train, y=y_train)

In [12]:
# NOTE: the name is misleading -- `pip` was last defined with an SVC final
# step, so these are that pipeline's predicted labels for x_test, not a
# logistic-regression model. The name is kept because a later cell reads it.
logisticRegression = pip.predict(X=x_test)
logisticRegression

In [13]:
# Accuracy on the held-out split, as a percentage rounded to two decimals.
print(
    'accuracy of the model:',
    str(np.round(accuracy_score(y_test, logisticRegression) * 100, 2)) + '%',
)

In [18]:
def predict_reviews(input_data, pipelines):
    """Classify a single review with every pipeline in ``pipelines``.

    Args:
        input_data: The review to classify; it is wrapped in a one-element
            list before being passed to each pipeline's ``predict``.
        pipelines: Mapping of model name to an object exposing ``predict``.

    Returns:
        dict: Model name mapped to the first element of that pipeline's
        prediction for the review.
    """
    return {
        name: fitted.predict([input_data])[0]
        for name, fitted in pipelines.items()
    }

def _build_text_pipeline(classifier):
    """Return a bag-of-words -> TF-IDF -> ``classifier`` pipeline."""
    return Pipeline(steps=[
        ('bow', CountVectorizer(analyzer=convertmyTxt)),
        ('tfidf', TfidfTransformer()),
        ('classifier', classifier),
    ])


# One pipeline per model. All three share the same vectorisation steps and
# differ only in the final estimator. The random forest is seeded so that
# refitting on the same data reproduces the same model.
pipelines = {
    'RandomForest': _build_text_pipeline(RandomForestClassifier(random_state=42)),
    'SVC': _build_text_pipeline(SVC()),
    'LogisticRegression': _build_text_pipeline(LogisticRegression()),
}

# Fit every pipeline on the training split, in the order declared above.
for model_name in pipelines:
    pipelines[model_name].fit(x_train, y_train)


In [25]:
input_data = "Helmet is is very good Thankyou for amezan."
predictions = predict_reviews(input_data, pipelines)
# Majority vote across the three models. Per the dataset's published
# description, 'CG' marks computer-generated (fake) reviews and 'OR' marks
# original (genuine) ones, so a 'CG' majority means the review is spam.
real_or_fake = 'spam' if list(predictions.values()).count('CG') >= 2 else 'real'
print(f"The review is {real_or_fake}")