In [42]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import KFold

In [22]:
df = pd.read_pickle('Complete_NST_Text.pkl')

In [23]:
#cleans \n slashes from text
df['text'] = df['text'].replace(r'\n', ' ', regex=True)

In [24]:
#creates a list of states
state_list = list(df['State'].groupby(df['State']).count().index)

In [25]:
#applies a number to each state
state_dict = {}
for idx, state in enumerate(state_list):
    state_dict[state] = idx

In [26]:
#drops any rows where State = NA
df = df.dropna(subset=['State'])

In [27]:
#applies the code to each row
df['state_code'] = df['State'].apply(lambda x: state_dict[x])

In [10]:
#creates test train split
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['state_code'])

In [11]:
#creates a count vectorizer object
vect = CountVectorizer(stop_words='english').fit(X_train)

#uses vect object to create a sparse matrix
X_train_vectorized = vect.transform(X_train)

#creates and fits model
model = LogisticRegression(solver='newton-cg', multi_class='multinomial')
model.fit(X_train_vectorized, y_train)



predictions = model.predict(vect.transform(X_test))

In [15]:
predictions

array([26,  7, 32, ...,  1,  5,  5])

In [16]:
accuracy_score(y_test, predictions)

0.84269662921348309

In [84]:
#Borno, not Borno

In [29]:
def set_borno(x):
    if x == 'Borno':
        return 1
    else:
        return 0

In [30]:
df['Borno'] = df['State'].apply(lambda x: set_borno(x))

In [33]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['Borno'])

In [34]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['Borno'])
#creates a count vectorizer object
vect_borno = CountVectorizer(stop_words='english').fit(X_train)

#uses vect object to create a sparse matrix
X_train_vectorized_borno = vect.transform(X_train)

#creates and fits model
model_borno = LogisticRegression(solver='newton-cg', multi_class='multinomial')
model.fit(X_train_vectorized_borno, y_train)



predictions = model.predict(vect.transform(X_test))

In [35]:
accuracy_score(predictions, y_test#)

0.95692883895131087

In [36]:
#Predict other states

In [40]:
no_borno = df[df['Borno']==0]

In [41]:
X_train, X_test, y_train, y_test = train_test_split(no_borno['text'], no_borno['state_code'])
#creates a count vectorizer object
vect_borno = CountVectorizer(stop_words='english').fit(X_train)

#uses vect object to create a sparse matrix
X_train_vectorized_borno = vect.transform(X_train)

#creates and fits model
model_borno = LogisticRegression(solver='newton-cg', multi_class='multinomial')
model.fit(X_train_vectorized_borno, y_train)



predictions = model.predict(vect.transform(X_test))
accuracy_score(predictions, y_test)

0.84519350811485638

In [43]:
#check states with k-folds

In [55]:
from sklearn.model_selection import KFold, cross_val_score

In [59]:
x_train = vect.transform(no_borno['text'])

#kf = KFold(len(no_borno['state_code']), n_folds==3, shuffle=True, random_state=0)
cross_val_score(model_borno, x_train, no_borno['state_code'], n_jobs=1)


array([ 0.74025974,  0.81162137,  0.86011342])

In [60]:
nums = [0.74025974,  0.81162137,  0.86011342]

In [62]:
sum(nums)/3

0.8039981766666667