In [1]:
import pandas as pd 
data = pd.read_csv("https://raw.githubusercontent.com/AIP-BITS/ML-CISCO/main/data/food-sentiment.csv")

In [2]:
data.head()

Unnamed: 0,Text,Sentiment
0,This coffee has a great bold flavor without th...,1
1,This is one of the best coconut oils. Nice ar...,1
2,This candy is sort of like those sugar coated ...,1
3,Description says 45 total whole leaf pouches b...,0
4,I love this popcorn so much. I pop it in my W...,1


In [3]:
X = data.Text
y = data.Sentiment

In [4]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.20)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(sublinear_tf=True,max_df=0.5,stop_words="english")
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [8]:
X_train.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [10]:
%%time
from sklearn.naive_bayes import GaussianNB 
model = GaussianNB() 
model.fit(X_train.toarray(),y_train)

CPU times: user 779 ms, sys: 499 ms, total: 1.28 s
Wall time: 1.28 s


In [11]:
from sklearn.metrics import accuracy_score 
pred_train = model.predict(X_train.toarray())
print("Training Accuracy",accuracy_score(y_train,pred_train))

pred_test = model.predict(X_test.toarray())
print("Testing Accuracy",accuracy_score(y_test,pred_test))

Training Accuracy 0.821
Testing Accuracy 0.6485


In [13]:
%%time
from sklearn.naive_bayes import BernoulliNB 
model = BernoulliNB() 
model.fit(X_train,y_train)

CPU times: user 5.99 ms, sys: 485 µs, total: 6.48 ms
Wall time: 5.46 ms


In [14]:
from sklearn.metrics import accuracy_score 
pred_train = model.predict(X_train)
print("Training Accuracy",accuracy_score(y_train,pred_train))

pred_test = model.predict(X_test)
print("Testing Accuracy",accuracy_score(y_test,pred_test))

Training Accuracy 0.873375
Testing Accuracy 0.8585


In [15]:
%%time
from sklearn.naive_bayes import MultinomialNB 
model = MultinomialNB() 
model.fit(X_train,y_train)

CPU times: user 3.63 ms, sys: 0 ns, total: 3.63 ms
Wall time: 2.93 ms


In [16]:
from sklearn.metrics import accuracy_score 
pred_train = model.predict(X_train)
print("Training Accuracy",accuracy_score(y_train,pred_train))

pred_test = model.predict(X_test)
print("Testing Accuracy",accuracy_score(y_test,pred_test))

Training Accuracy 0.85125
Testing Accuracy 0.8735


In [18]:
data["Sentiment"].value_counts()

1    8557
0    1443
Name: Sentiment, dtype: int64

In [19]:
%%time
from sklearn.naive_bayes import ComplementNB 
model = ComplementNB() 
model.fit(X_train,y_train)

CPU times: user 4.08 ms, sys: 9 µs, total: 4.09 ms
Wall time: 4.1 ms


In [20]:
from sklearn.metrics import accuracy_score 
pred_train = model.predict(X_train)
print("Training Accuracy",accuracy_score(y_train,pred_train))

pred_test = model.predict(X_test)
print("Testing Accuracy",accuracy_score(y_test,pred_test))

Training Accuracy 0.879125
Testing Accuracy 0.879


# Logistic Regression

In [21]:
%%time
from sklearn.linear_model import LogisticRegression
model = LogisticRegression() 
model.fit(X_train,y_train)

CPU times: user 715 ms, sys: 2.55 s, total: 3.27 s
Wall time: 298 ms


In [22]:
from sklearn.metrics import accuracy_score 
pred_train = model.predict(X_train)
print("Training Accuracy",accuracy_score(y_train,pred_train))

pred_test = model.predict(X_test)
print("Testing Accuracy",accuracy_score(y_test,pred_test))

Training Accuracy 0.897625
Testing Accuracy 0.8985


In [25]:
import warnings
warnings.filterwarnings('ignore')

import time 
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score 

result = pd.DataFrame()


for solv in ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']:
    for pen in ['l1', 'l2', 'elasticnet', 'none']:
        
        try:
            model = None 
            tic = time.time()
            model = LogisticRegression(solver=solv,penalty=pen)
            model.fit(X_train,y_train)

            train_time = time.time()-tic

            tic = time.time()
            pred_train = model.predict(X_train)
            pred_test = model.predict(X_test)

            pred_time = time.time()-tic

            train_accuracy = accuracy_score(y_train,pred_train)
            test_accuracy = accuracy_score(y_test,pred_test)

            df = {
                "solver":solv,
                "penalty":pen,
                "train_accuracy":train_accuracy,
                "test_accuracy":test_accuracy,
                "train_time":train_time,
                "pred_time":pred_time
            }

            result=result.append(df,ignore_index=True)
        except:
            pass
        
        
        
        
        

In [26]:
result

Unnamed: 0,solver,penalty,train_accuracy,test_accuracy,train_time,pred_time
0,newton-cg,l2,0.897625,0.8985,0.168446,0.000819
1,newton-cg,none,1.0,0.8955,0.580745,0.00091
2,lbfgs,l2,0.897625,0.8985,0.181306,0.000829
3,lbfgs,none,1.0,0.898,0.146353,0.000819
4,liblinear,l1,0.89975,0.903,0.064584,0.001096
5,liblinear,l2,0.89775,0.899,0.041241,0.001013
6,sag,l2,0.897625,0.8985,0.069845,0.000554
7,sag,none,1.0,0.899,0.263913,0.000553
8,saga,l1,0.899875,0.903,0.56873,0.000648
9,saga,l2,0.89775,0.8995,0.057304,0.000526
