## IMPORTING PACKAGES

# In [1]:
import pandas as pd
from collections import Counter
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn import preprocessing
import math
import ast
import json
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
import matplotlib.pyplot as plt
import seaborn as sns

##  MACHINE LEARNING

# In [2]:
def datatype(df):
    """Drop, in place, every row whose ``review`` entry has length zero.

    Rows are selected with a boolean mask and removed by their index
    labels, so the frame does not need a default ``0..n-1`` index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``review`` column whose entries support ``len()``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the empty reviews removed.
        Remaining rows keep their index labels.
    """
    is_empty = df["review"].map(len).eq(0).to_numpy(dtype=bool)
    # NOTE: drop() works by label, so if index labels are duplicated every
    # row sharing a label with an empty review is removed as well.
    df.drop(df.index[is_empty], inplace=True)
    return df

def sentiment_scores(data):
    """Label every review with a VADER-based sentiment class.

    Each entry of ``data['review']`` is joined with single spaces and scored
    with ``SentimentIntensityAnalyzer.polarity_scores``; the ``compound``
    score decides the label.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``review`` column of string iterables
        (presumably token lists -- confirm against the caller).

    Returns
    -------
    pandas.DataFrame
        The same frame with a new ``Sentiment_vader`` column holding
        ``"Positive"``, ``"Negative"`` or ``"Neutral"``.
    """
    analyzer = SentimentIntensityAnalyzer()

    def label_for(tokens):
        compound = analyzer.polarity_scores(" ".join(tokens))['compound']
        # Scores inside (-0.05, 0.05) are treated as neutral.
        if compound >= 0.05:
            return "Positive"
        if compound <= -0.05:
            return "Negative"
        return "Neutral"

    data['Sentiment_vader'] = [label_for(tokens) for tokens in data['review']]
    return data

def count_doc(data,process):
    """Count, for every distinct token, how many documents contain it.

    Parameters
    ----------
    data : iterable of iterables
        The documents; each document is an iterable of hashable tokens.
    process : str
        ``"fit"`` additionally returns an empty DataFrame whose columns are
        the sorted vocabulary; any other value returns only the counts.

    Returns
    -------
    dict or (dict, pandas.DataFrame)
        ``{token: number of documents containing it}``, plus the empty
        vocabulary frame when ``process == "fit"``.
    """
    # One pass over the corpus: de-duplicate tokens per document so that each
    # document contributes at most 1 to a token's count.
    doc_freq = dict(Counter(
        token for document in data for token in set(document)
    ))
    if process=="fit":
        vocab_frame = pd.DataFrame(columns=list(doc_freq))
        vocab_frame.sort_index(axis=1, inplace=True)
        return doc_freq, vocab_frame
    return doc_freq
    
def tf_idf_fit(data):
    """Build a row-normalised TF-IDF matrix for a corpus of tokenised documents.

    For every token of a document the weight is
    ``(count in document / document length) * (log((N + 1) / (df + 1)) + 1)``
    where ``N`` is the number of documents and ``df`` the number of documents
    containing the token. Rows are then rescaled by
    ``sklearn.preprocessing.normalize`` with ``axis=1``.

    Parameters
    ----------
    data : pandas.Series (or other sized iterable)
        One iterable of hashable tokens per document.

    Returns
    -------
    pandas.DataFrame
        One row per document, in input order, with a fresh ``0..N-1`` index
        and the sorted vocabulary as columns. A document without tokens
        yields an all-zero row, so rows always line up with ``data``.
    """
    doc_freq, vocab_frame = count_doc(data, "fit")
    n_docs = len(data)
    # Pre-allocate the whole matrix; every document gets a row even when it
    # has no tokens, which keeps the output aligned with the input.
    weights = pd.DataFrame(0.0, index=range(n_docs), columns=vocab_frame.columns)
    for row, tokens in enumerate(data):
        n_tokens = len(tokens)
        # Counter yields each distinct token once, so no cell is written twice.
        for token, occurrences in Counter(tokens).items():
            tf = occurrences / n_tokens
            idf = math.log((n_docs + 1) / (doc_freq[token] + 1)) + 1
            weights.at[row, token] = tf * idf
    return pd.DataFrame(preprocessing.normalize(weights, axis=1), columns=weights.columns)


def tf_idf_transform(data,df):
    """Project new tokenised documents onto the vocabulary of a fitted matrix.

    Tokens that are not columns of ``df`` are ignored. Weights use the same
    formula as the fit step, and rows are rescaled by
    ``sklearn.preprocessing.normalize`` with ``axis=1``.

    Parameters
    ----------
    data : pandas.Series (or other sized iterable)
        One iterable of hashable tokens per document to transform.
    df : pandas.DataFrame
        Fitted TF-IDF frame; only its columns and its length are read.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per input document, labelled ``len(df) .. len(df)+len(data)-1``,
        with ``df``'s columns. A document with no in-vocabulary token yields
        an all-zero row.

    Notes
    -----
    NOTE(review): the IDF term is computed from ``data`` itself (its own
    size and document frequencies), not from the corpus ``df`` was fitted
    on. Using the fitted statistics would need them to be passed in.
    """
    n_docs, n_fitted, vocab = len(data), len(df), df.columns
    doc_freq = count_doc(data, "transform")
    rows = range(n_fitted, n_fitted + n_docs)
    # Build the new rows in their own frame so the caller's fitted matrix is
    # left untouched and every document is guaranteed a row.
    weights = pd.DataFrame(0.0, index=rows, columns=vocab)
    for row, tokens in zip(rows, data):
        n_tokens = len(tokens)
        for token, occurrences in Counter(tokens).items():
            if token in vocab:
                tf = occurrences / n_tokens
                idf = math.log((n_docs + 1) / (doc_freq[token] + 1)) + 1
                weights.at[row, token] = tf * idf
    if n_docs == 0:
        # normalize() rejects an array with zero samples.
        return weights
    return pd.DataFrame(preprocessing.normalize(weights, axis=1), index=rows, columns=vocab)

def classes(data):
    """Summarise how many reviews fall into each sentiment class.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``Sentiment_vader`` column.

    Returns
    -------
    pandas.DataFrame
        Three rows (Positive, Negative, Neutral, in that order) with the
        columns ``Classes`` and ``No.of Reviews``.
    """
    labels = ['Positive', 'Negative', 'Neutral']
    sentiments = data["Sentiment_vader"]
    totals = [int((sentiments == label).sum()) for label in labels]
    return pd.DataFrame({'Classes': labels, 'No.of Reviews': totals})

def _fit_score_predict(model, X_train, X_test, y_train, y_test, unseen):
    """Fit ``model``; return ``(hold-out accuracy, predictions for unseen)``."""
    model.fit(X_train, y_train)
    report = classification_report(y_test, model.predict(X_test), output_dict=True)
    return report["accuracy"], model.predict(unseen)


def evaluation_metrics(data,df,a):
    """Train single and combined classifiers and report the best ensembles.

    The data is split 75/25 (``random_state=0``). Four single learners
    (random forest, logistic regression, gradient boosting, AdaBoost) are
    fitted first, followed by 27 combinations: ``VotingClassifier`` groupings
    with default settings and two 10-estimator ``BaggingClassifier`` models.
    Every model is scored on the hold-out part and also asked to predict ``a``.

    Parameters
    ----------
    data : array-like
        Feature matrix, one row per labelled review.
    df : pandas.DataFrame
        Frame whose ``Sentiment_vader`` column holds the target labels.
    a : array-like
        Feature rows for which every fitted model produces a prediction.

    Returns
    -------
    tuple
        ``(top5, singles, best_prediction)`` where ``top5`` has the columns
        ``Ensemble method`` and ``accuracy`` for the five most accurate
        combinations (ties keep training order), ``singles`` has ``methods``
        and ``accuracy`` for the first two single learners, and
        ``best_prediction`` is the top combination's prediction for ``a``.

    Side effects
    ------------
    Sets the module-level names ``newdf`` (top-five table including the
    ``prediction`` column), ``d`` (predictions of all combinations, in
    training order) and ``d1`` (predictions of the four single learners).
    """
    global newdf, d, d1

    split = train_test_split(
        data, df['Sentiment_vader'], random_state=0, test_size=0.25)

    models = {
        "rf": RandomForestClassifier(),
        "lr": LogisticRegression(),
        "gb": GradientBoostingClassifier(),
        "ada": AdaBoostClassifier(),
    }
    single_titles = {
        "rf": "Random Forest",
        "lr": "Logistic Regression",
        "gb": "Gradient Boosting",
        "ada": "Ada Boosting",
    }

    name1, acc1, d1 = [], [], []
    for key, title in single_titles.items():
        score, predicted = _fit_score_predict(models[key], *split, a)
        name1.append(title)
        acc1.append(score)
        d1.append(predicted)

    # The wrapped estimator is passed positionally: its keyword was renamed
    # from ``base_estimator`` to ``estimator`` between scikit-learn releases,
    # but it is the first positional parameter in both.
    sample = 10
    models["brf"] = BaggingClassifier(models["rf"], n_estimators=sample)
    models["blr"] = BaggingClassifier(models["lr"], n_estimators=sample)

    # (title, members) in training order. A single member means the model is
    # evaluated on its own; several members are combined by voting.
    combinations = [
        ("Random Forest and Logistic Regression", ("rf", "lr")),
        ("Random forest and Gradient Boosting", ("rf", "gb")),
        ("Logistic Regression and Gradient Boosting", ("lr", "gb")),
        ("Random forest,Logistic Regression with Gradient Boosting", ("rf", "gb", "lr")),
        ("Random forest with Ada Boosting", ("rf", "ada")),
        ("Logistic Regression with Ada Boosting", ("lr", "ada")),
        ("Random forest,Logistic Regression with Ada Boosting", ("rf", "ada", "lr")),
        ("Random forest,Logistic Regression with Ada Boosting,Gradient Boosting", ("rf", "gb", "ada", "lr")),
        ("Random forest with Ada Boosting and Gradient Boosting", ("rf", "gb", "ada")),
        ("Logistic Regression with Ada Boosting and Gradient Boosting", ("lr", "gb", "ada")),
        # NOTE(review): same pairing as "Logistic Regression and Gradient
        # Boosting" above under a second title; kept so rankings are unchanged.
        ("Logistic Regression with Gradient Boosting", ("lr", "gb")),
        ("Bagging-Random Forest", ("brf",)),
        ("Bagging-Logistic Regression", ("blr",)),
        ("Logistic Regression,Bagging logistic regression", ("lr", "blr")),
        ("Random Forest,Bagging random forest", ("brf", "rf")),
        ("Bagging random forest with Ada Boosting", ("ada", "brf")),
        ("Bagging random forest with Gradient Boosting", ("gb", "brf")),
        ("Bagging Logistic Regression with Ada Boosting", ("ada", "blr")),
        ("Bagging Logistic Regression with Gradient Boosting", ("gb", "blr")),
        ("Random Forest,Bagging random forest with Gradient Boosting", ("gb", "brf", "rf")),
        ("Logistic Regression,Bagging logistic regression with Gradient Boosting", ("gb", "blr", "lr")),
        ("Logistic Regression,Bagging logistic regression with Ada Boosting", ("ada", "blr", "lr")),
        ("Random Forest,Bagging Random forest with Ada Boosting", ("ada", "brf", "rf")),
        ("Bagging Random forest with Gradient Boosting and Ada Boosting", ("gb", "ada", "brf")),
        ("Bagging logistic regression with Gradient Boosting, Ada Boosting", ("gb", "blr", "ada")),
        ("Logistic Regression,Bagging logistic regression with Gradient Boosting,Ada Boosting", ("gb", "blr", "lr", "ada")),
        ("Random Forest,Bagging Random Forest with Gradient Boosting,Ada Boosting", ("gb", "brf", "rf", "ada")),
    ]

    name, acc, d = [], [], []
    for title, members in combinations:
        if len(members) == 1:
            model = models[members[0]]
        else:
            model = VotingClassifier(
                estimators=[(key, models[key]) for key in members])
        score, predicted = _fit_score_predict(model, *split, a)
        name.append(title)
        acc.append(score)
        d.append(predicted)

    # sorted() is stable even with reverse=True, so equal accuracies keep
    # their training order -- the same tie-break as taking the first maximum
    # repeatedly.
    top_n = 5
    best = sorted(range(len(acc)), key=acc.__getitem__, reverse=True)[:top_n]

    # The tables are built in one go rather than filled cell by cell through
    # chained indexing, which pandas does not guarantee to write through.
    newdf = pd.DataFrame({
        'Ensemble method': [name[i] for i in best],
        'accuracy': [acc[i] for i in best],
        'prediction': [d[i] for i in best],
    })

    # NOTE(review): only the first two single learners are reported here;
    # gradient boosting and AdaBoost are trained but left out of this table.
    reported = 2
    sup = pd.DataFrame({
        'methods': name1[:reported],
        'accuracy': acc1[:reported],
        'prediction': d1[:reported],
    })

    return newdf.drop("prediction",axis=1),sup.drop("prediction",axis=1),d[best[0]]

    