In [1]:
from collections import Counter
from functools import partial

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
# Load the vocabulary file. It is space-separated with no header row; column 0
# holds the word stems that are later used as feature column names.
# The `with` block guarantees the file handle is closed once pandas has read it
# (the bare `open(...)` used previously was never closed).
with open("../data/word_indices_labels.txt", "r") as col_names:
    col_names_df = pd.read_csv(col_names, sep=" ", header=None)

# Plain Python list of the stems, in file order.
list_names = col_names_df[0].tolist()
list_names

['dlr',
 'new',
 'york',
 'sale',
 'time',
 'cocoa',
 'dec',
 'smith',
 'juli',
 'sept',
 'crop',
 'bag',
 'mln',
 'bahia',
 'aug',
 'april',
 'port',
 'oct',
 'total',
 'june',
 'end',
 'februari',
 'week',
 'tonn',
 'figur',
 'year',
 'bean',
 'review',
 'open',
 'arriv',
 'shipment',
 'butter',
 'area',
 'period',
 'doubt',
 'come',
 'farmer',
 'limit',
 'estim',
 'uruguai',
 'shipper',
 'sold',
 'export',
 'currenc',
 'hand',
 'expect',
 'cake',
 'certif',
 'salvador',
 'destin',
 'make',
 'humid',
 'normal',
 'middai',
 'drought',
 'level',
 'routin',
 'offer',
 'held',
 'dry',
 'book',
 'went',
 'name',
 'practic',
 'superior',
 'improv',
 'good',
 'zone',
 'lower',
 'thousand',
 'hundr',
 'light',
 'made',
 'cumul',
 'price',
 'prospect',
 'fob',
 'publish',
 'earlier',
 'allevi',
 'rose',
 'obtain',
 'view',
 'sell',
 'commis',
 'continu',
 'part',
 'avail',
 'harvest',
 'brazilian',
 'stand',
 'mean',
 'fit',
 'januari',
 'processor',
 'earli',
 'late',
 'includ',
 'convert',


In [3]:
# Training labels: a headerless single-column file, read into a column named "class".
# The `with` block closes the handle as soon as pandas is done with it
# (the bare `open(...)` used previously was never closed).
with open("../data/training_labels.txt", "r") as train_labels:
    train_class_df = pd.read_csv(train_labels, header=None, names=["class"])

# Collapse the one-column frame into a Series so it can be passed as `y` to estimators.
train_class_series = train_class_df.squeeze()

# Training features: headerless CSV whose columns are named after the vocabulary
# stems in `list_names`.
training_df = pd.read_csv("../data/training_data.csv", header=None, names=list_names)

In [4]:
# Test labels: a headerless single-column file, read into a column named "class".
# The `with` block closes the handle as soon as pandas is done with it
# (the bare `open(...)` used previously was never closed).
with open("../data/testing_labels.txt", "r") as test_labels:
    testing_class_df = pd.read_csv(test_labels, header=None, names=["class"])

# Collapse the one-column frame into a Series so it can be passed to the metric functions.
testing_class_series = testing_class_df.squeeze()

# Test features: headerless CSV whose columns are named after the vocabulary
# stems in `list_names`.
testing_df = pd.read_csv("../data/testing_data.csv", header=None, names=list_names)

In [5]:
from sklearn.preprocessing import StandardScaler

# Learn per-feature mean and variance from the training split only, then apply
# that same transformation to both splits so no test-set statistics leak in.
sc = StandardScaler()
sc.fit(training_df)

# Note: despite the `_df` suffix, `transform` returns NumPy arrays here.
training_sc_df = sc.transform(training_df)
testing_sc_df = sc.transform(testing_df)

In [6]:
from sklearn.naive_bayes import BernoulliNB

# Bernoulli Naive Bayes fitted on the standardized training features.
# NOTE(review): BernoulliNB binarizes its input (default threshold 0.0), so on
# standardized data each feature presumably becomes "above / not above its
# training mean" rather than "word present / absent" — confirm this is intended.
classifier = BernoulliNB()
classifier.fit(X=training_sc_df, y=train_class_series)

BernoulliNB()

In [7]:
# Predicted class for every row of the standardized test set.
y_prediction = classifier.predict(X=testing_sc_df)

In [8]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Rows are the true classes, columns are the predicted classes.
confusion_matrix(y_true=testing_class_series, y_pred=y_prediction)

array([[1040,   47],
       [  97,  622]], dtype=int64)

In [9]:
# Fraction of test rows whose predicted class matches the true class.
accuracy_score(y_true=testing_class_series, y_pred=y_prediction)

0.920265780730897

In [10]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif

# Keep the `num_features` columns with the highest estimated mutual information
# with the class label.
num_features = 1000

# mutual_info_classif uses random numbers internally for dense input, so without
# a fixed `random_state` the selected columns (and every score downstream) can
# change between runs. Pin the seed so Restart & Run All is reproducible.
# NOTE(review): if these columns are word counts, `discrete_features=True` may be
# the more appropriate setting — confirm before changing.
mi_selector = SelectKBest(partial(mutual_info_classif, random_state=0), k=num_features)

# Fit the selector on the training split only, then apply the same column
# subset to the test split.
training_mi_df = mi_selector.fit_transform(training_df, train_class_series)
testing_mi_df = mi_selector.transform(testing_df)

In [11]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

# --- Multinomial Naive Bayes on the MI-selected features ---
# `fit` returns the estimator itself, so construction and fitting can be chained.
naive_bayes = MultinomialNB().fit(training_mi_df, train_class_series)
naive_predict = naive_bayes.predict(testing_mi_df)

# --- Logistic regression on the same features ---
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases —
# confirm against the installed version.
log_reg = LogisticRegression(solver="newton-cg", multi_class="multinomial").fit(
    training_mi_df, train_class_series
)
log_reg_predict = log_reg.predict(testing_mi_df)

# Test-set accuracy of each model.
naive_accuracy = accuracy_score(naive_predict, testing_class_series)
log_reg_accuracy = accuracy_score(log_reg_predict, testing_class_series)

print('Naive Accuracy score: ', f"{naive_accuracy}")
print('Logistic Regress Accuracy score: ', f"{log_reg_accuracy}")

Naive Accuracy score:  0.9767441860465116
Logistic Regress Accuracy score:  0.9850498338870431


In [12]:
# Repeat the mutual-information selection, this time keeping only 100 columns.
num_features = 100

# mutual_info_classif uses random numbers internally for dense input; the
# np.random.seed call below runs *after* the selector is fitted, so it does not
# make this step reproducible. Pin the scorer's own `random_state` instead.
mi_selector_100 = SelectKBest(partial(mutual_info_classif, random_state=0), k=num_features)
training_mi_df_100 = mi_selector_100.fit_transform(training_df, train_class_series)
testing_mi_df_100 = mi_selector_100.transform(testing_df)

# Draw 20 distinct training rows at random (seeded so the same rows are drawn
# on every run) and take the matching labels by position.
np.random.seed(0)
random_indices = np.random.choice(training_mi_df_100.shape[0], size=20, replace=False)
random_training_set_20 = training_mi_df_100[random_indices]
random_training_labels_20 = train_class_series.iloc[random_indices]

In [13]:
# --- Multinomial Naive Bayes trained on only the 20 sampled rows ---
# `fit` returns the estimator itself, so construction and fitting can be chained.
naive_bayes = MultinomialNB().fit(random_training_set_20, random_training_labels_20)
naive_predict = naive_bayes.predict(testing_mi_df_100)

# --- Logistic regression trained on the same 20 rows ---
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases —
# confirm against the installed version.
log_reg = LogisticRegression(solver="newton-cg", multi_class="multinomial").fit(
    random_training_set_20, random_training_labels_20
)
log_reg_predict = log_reg.predict(testing_mi_df_100)

# Accuracy of each model on the 100-feature test set.
naive_accuracy = accuracy_score(naive_predict, testing_class_series)
log_reg_accuracy = accuracy_score(log_reg_predict, testing_class_series)

print('Naive Accuracy score: ', f"{naive_accuracy}")
print('Logistic Regress Accuracy score: ', f"{log_reg_accuracy}")

Naive Accuracy score:  0.959579180509413
Logistic Regress Accuracy score:  0.9307862679955703


Since both the Naive Bayes and logistic regression models still perform well despite the reduction in data (the accuracy score drops by only about 2 percentage points for Naive Bayes and about 5 for logistic regression), this suggests that the assumption holds reasonably well for this dataset.