# 20newsgroups

In [1]:
import pandas as pd 
import numpy as np 
df = pd.read_csv("https://raw.githubusercontent.com/AIP-BITS/ML-CISCO/main/data/news-group.csv")

In [2]:
X = df.data
y = df.target

In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20)

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

USE_HASHING=False

if USE_HASHING:
    vectorizer = HashingVectorizer(
        stop_words="english", alternate_sign=False, n_features=N_FEATURES
    )
    X_train = vectorizer.transform(X_train)
else:
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words="english")
    X_train = vectorizer.fit_transform(X_train)
    
X_test = vectorizer.transform(X_test)

In [5]:
import time
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score

result = pd.DataFrame()

for solv in ["liblinear", "newton-cg", "lbfgs", "sag","saga"]:
    for penalty in ["l1", "l2", "elasticnet", "none"]:
        
        model = None
        tic = time.time()
        model = LogisticRegression(solver=solv)
        model.fit(X_train, y_train)

        train_time = time.time()-tic

        tic = time.time()
        pred_train=model.predict(X_train)
        pred_test=model.predict(X_test)

        pred_time = time.time()-tic

        df = {"solver":solv,"train_time":train_time,"pred_time":pred_time,
              "Train accuracy":accuracy_score(y_train,pred_train),
              "Test accuracy":accuracy_score(y_test,pred_test),
              "penalty":penalty
             }
        result=result.append(df, ignore_index = True)

In [6]:
result[["solver","penalty",'Train accuracy','Test accuracy','train_time','pred_time']]

Unnamed: 0,solver,penalty,Train accuracy,Test accuracy,train_time,pred_time
0,liblinear,l1,0.984496,0.955752,0.129792,0.0
1,liblinear,l2,0.984496,0.955752,0.125008,0.0
2,liblinear,elasticnet,0.984496,0.955752,0.125062,0.005013
3,liblinear,none,0.984496,0.955752,0.125004,0.005013
4,newton-cg,l1,0.99151,0.958702,0.596881,0.003175
5,newton-cg,l2,0.99151,0.958702,0.641918,0.005013
6,newton-cg,elasticnet,0.99151,0.958702,0.590137,0.005012
7,newton-cg,none,0.99151,0.958702,0.556953,0.003102
8,lbfgs,l1,0.99151,0.958702,1.747691,0.012768
9,lbfgs,l2,0.99151,0.958702,1.853001,0.005008


# Food-Review

In [7]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd 
data = pd.read_csv("https://raw.githubusercontent.com/AIP-BITS/ML-CISCO/main/data/food-sentiment.csv")

In [8]:
# data = data.sample(5000)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data.Text,data.Sentiment,
                                                    test_size=.20)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

USE_HASHING=False

if USE_HASHING:
    vectorizer = HashingVectorizer(
        stop_words="english", alternate_sign=False
    )
    X_train = vectorizer.transform(X_train)
else:
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words="english")
    X_train = vectorizer.fit_transform(X_train)
    
X_test = vectorizer.transform(X_test)

In [10]:
import time
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score

result = pd.DataFrame()

for solv in ["liblinear", "newton-cg", "lbfgs", "sag","saga"]:
    for penalty in ["l1", "l2", "elasticnet", "none"]:
        
        model = None
        tic = time.time()
        model = LogisticRegression(solver=solv)
        model.fit(X_train, y_train)

        train_time = time.time()-tic

        tic = time.time()
        pred_train=model.predict(X_train)
        pred_test=model.predict(X_test)

        pred_time = time.time()-tic

        df = {"solver":solv,"train_time":train_time,"pred_time":pred_time,
              "Train accuracy":accuracy_score(y_train,pred_train),
              "Test accuracy":accuracy_score(y_test,pred_test),
              "penalty":penalty
             }
        result=result.append(df, ignore_index = True)

In [11]:
result[["solver","penalty",'Train accuracy','Test accuracy','train_time','pred_time']]

Unnamed: 0,solver,penalty,Train accuracy,Test accuracy,train_time,pred_time
0,liblinear,l1,0.8985,0.8855,0.050236,0.0
1,liblinear,l2,0.8985,0.8855,0.04,0.0
2,liblinear,elasticnet,0.8985,0.8855,0.040156,0.0
3,liblinear,none,0.8985,0.8855,0.030001,0.0
4,newton-cg,l1,0.89875,0.8855,0.089963,0.0
5,newton-cg,l2,0.89875,0.8855,0.100089,0.0
6,newton-cg,elasticnet,0.89875,0.8855,0.090065,0.0
7,newton-cg,none,0.89875,0.8855,0.091237,0.0
8,lbfgs,l1,0.89875,0.8855,0.129718,0.0
9,lbfgs,l2,0.89875,0.8855,0.139968,0.0
