# 20newsgroups

In [39]:
from sklearn.datasets import fetch_20newsgroups 

categories = [
    "alt.atheism",
    "talk.religion.misc",
    "comp.graphics",
    "sci.space",
]

data_train = fetch_20newsgroups(
    subset="train", categories=categories, shuffle=True, random_state=42
)

data_test = fetch_20newsgroups(
    subset="test", categories=categories, shuffle=True, random_state=42
)

In [40]:
X_train = data_train["data"]
y_train = data_train["target"]

X_test = data_test["data"]
y_test = data_test["target"]

In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

USE_HASHING=False

if USE_HASHING:
    vectorizer = HashingVectorizer(
        stop_words="english", alternate_sign=False, n_features=N_FEATURES
    )
    X_train = vectorizer.transform(data_train.data)
else:
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words="english")
    X_train = vectorizer.fit_transform(data_train.data)
    
X_test = vectorizer.transform(data_test.data)

In [42]:
import time
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score

result = pd.DataFrame()

for solv in ["liblinear", "newton-cg", "lbfgs", "sag","saga"]:
    for penalty in ["l1", "l2", "elasticnet", "none"]:
        
        model = None
        tic = time.time()
        model = LogisticRegression(solver=solv)
        model.fit(X_train, y_train)

        train_time = time.time()-tic

        tic = time.time()
        pred_train=model.predict(X_train)
        pred_test=model.predict(X_test)

        pred_time = time.time()-tic

        df = {"solver":solv,"train_time":train_time,"pred_time":pred_time,
              "Train accuracy":accuracy_score(y_train,pred_train),
              "Test accuracy":accuracy_score(y_test,pred_test),
              "penalty":penalty
             }
        result=result.append(df, ignore_index = True)

In [43]:
result[["solver","penalty",'Train accuracy','Test accuracy','train_time','pred_time']]

Unnamed: 0,solver,penalty,Train accuracy,Test accuracy,train_time,pred_time
0,liblinear,l1,0.992625,0.883222,0.099522,0.0
1,liblinear,l2,0.992625,0.883222,0.089783,0.0
2,liblinear,elasticnet,0.992625,0.883222,0.100008,0.00983
3,liblinear,none,0.992625,0.883222,0.090318,0.0
4,newton-cg,l1,0.995575,0.889135,0.44976,0.0
5,newton-cg,l2,0.995575,0.889135,0.559627,0.0
6,newton-cg,elasticnet,0.995575,0.889135,0.460199,0.0
7,newton-cg,none,0.995575,0.889135,0.510408,0.009488
8,lbfgs,l1,0.995575,0.889135,1.7201,0.0
9,lbfgs,l2,0.995575,0.889135,1.88027,0.00501


# Food-Review

In [30]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd 
data = pd.read_csv("../data/food-review/FoodReviews.csv",index_col="Id")

In [31]:
data = data.sample(5000)

review_data = data[['Text','Score']]
review_data[['Sentiment']] = review_data.Score.map(lambda x: 0 if x < 3 else 1)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(review_data.Text,review_data.Sentiment,
                                                    test_size=.20)

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer

USE_HASHING=False

if USE_HASHING:
    vectorizer = HashingVectorizer(
        stop_words="english", alternate_sign=False
    )
    X_train = vectorizer.transform(X_train)
else:
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words="english")
    X_train = vectorizer.fit_transform(X_train)
    
X_test = vectorizer.transform(X_test)

In [33]:
import time
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score

result = pd.DataFrame()

for solv in ["liblinear", "newton-cg", "lbfgs", "sag","saga"]:
    for penalty in ["l1", "l2", "elasticnet", "none"]:
        
        model = None
        tic = time.time()
        model = LogisticRegression(solver=solv)
        model.fit(X_train, y_train)

        train_time = time.time()-tic

        tic = time.time()
        pred_train=model.predict(X_train)
        pred_test=model.predict(X_test)

        pred_time = time.time()-tic

        df = {"solver":solv,"train_time":train_time,"pred_time":pred_time,
              "Train accuracy":accuracy_score(y_train,pred_train),
              "Test accuracy":accuracy_score(y_test,pred_test),
              "penalty":penalty
             }
        result=result.append(df, ignore_index = True)

In [36]:
result[["solver","penalty",'Train accuracy','Test accuracy','train_time','pred_time']]

Unnamed: 0,solver,penalty,Train accuracy,Test accuracy,train_time,pred_time
0,liblinear,l1,0.88025,0.855,0.020312,0.0
1,liblinear,l2,0.88025,0.855,0.019894,0.0
2,liblinear,elasticnet,0.88025,0.855,0.019868,0.0
3,liblinear,none,0.88025,0.855,0.017497,0.0
4,newton-cg,l1,0.88,0.855,0.050468,0.0
5,newton-cg,l2,0.88,0.855,0.049619,0.0
6,newton-cg,elasticnet,0.88,0.855,0.04956,0.0
7,newton-cg,none,0.88,0.855,0.050051,0.0
8,lbfgs,l1,0.88,0.855,0.100354,0.0
9,lbfgs,l2,0.88,0.855,0.090133,0.0
