In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import KFold

In [2]:
df = pd.read_pickle('Complete_NST_Text.pkl')

#cleans \n slashes from text
df['text'] = df['text'].replace(r'\n', ' ', regex=True)

#creates a list of states
state_list = list(df['State'].groupby(df['State']).count().index)

#applies a number to each state
state_dict = {}
for idx, state in enumerate(state_list):
    state_dict[state] = idx

#drops any rows where State = NA
df = df.dropna(subset=['State'])

#applies the code to each row
df['state_code'] = df['State'].apply(lambda x: state_dict[x])

In [10]:
len(df)

4271

In [7]:
#creates test train split
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['state_code'])

In [8]:
#creates a count vectorizer object
vect = CountVectorizer(stop_words='english', ngram_range=(2,3)).fit(X_train)

#uses vect object to create a sparse matrix
X_train_vectorized = vect.transform(X_train)

#creates and fits model
model = LogisticRegression(solver='newton-cg', multi_class='multinomial')
model.fit(X_train_vectorized, y_train)



predictions = model.predict(vect.transform(X_test))
accuracy_score(y_test, predictions)

0.7443820224719101

In [15]:
predictions

array([26,  7, 32, ...,  1,  5,  5])

0.84269662921348309

In [84]:
#Borno, not Borno

In [29]:
def set_borno(x):
    if x == 'Borno':
        return 1
    else:
        return 0

In [30]:
df['Borno'] = df['State'].apply(lambda x: set_borno(x))

In [33]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['Borno'])

In [34]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['Borno'])
#creates a count vectorizer object
vect_borno = CountVectorizer(stop_words='english').fit(X_train)

#uses vect object to create a sparse matrix
X_train_vectorized_borno = vect.transform(X_train)

#creates and fits model
model_borno = LogisticRegression(solver='newton-cg', multi_class='multinomial')
model.fit(X_train_vectorized_borno, y_train)



predictions = model.predict(vect.transform(X_test))

In [35]:
accuracy_score(predictions, y_test#)

0.95692883895131087

In [36]:
#Predict other states

In [40]:
no_borno = df[df['Borno']==0]

In [41]:
X_train, X_test, y_train, y_test = train_test_split(no_borno['text'], no_borno['state_code'])
#creates a count vectorizer object
vect_borno = CountVectorizer(stop_words='english').fit(X_train)

#uses vect object to create a sparse matrix
X_train_vectorized_borno = vect.transform(X_train)

#creates and fits model
model_borno = LogisticRegression(solver='newton-cg', multi_class='multinomial')
model.fit(X_train_vectorized_borno, y_train)



predictions = model.predict(vect.transform(X_test))
accuracy_score(predictions, y_test)

0.84519350811485638

In [69]:
def log_reg(x,y):
    X_train, X_test, y_train, y_test = train_test_split(x, y)
    #creates a count vectorizer object
    vect = CountVectorizer(stop_words='english').fit(X_train)

    #uses vect object to create a sparse matrix
    X_train_vectorized = vect.transform(X_train)

    #creates and fits model
    model = LogisticRegression(solver='newton-cg', multi_class='multinomial')
    model.fit(X_train_vectorized, y_train)



    predictions = model.predict(vect.transform(X_test))
    return accuracy_score(predictions, y_test)
    

In [43]:
#check states with k-folds

In [55]:
from sklearn.model_selection import KFold, cross_val_score

In [59]:
x_train = vect.transform(no_borno['text'])

#kf = KFold(len(no_borno['state_code']), n_folds==3, shuffle=True, random_state=0)
cross_val_score(model_borno, x_train, no_borno['state_code'], n_jobs=1)


array([ 0.74025974,  0.81162137,  0.86011342])

In [60]:
nums = [0.74025974,  0.81162137,  0.86011342]

In [62]:
sum(nums)/3

0.8039981766666667

In [63]:
#lgas

In [66]:
#creates a list of states
lga_list = list(df['LGA'].groupby(df['LGA']).count().index)

#applies a number to each state
lga_dict = {}
for idx, lga in enumerate(lga_list):
    lga_dict[lga] = idx

#drops any rows where State = NA
df = df.dropna(subset=['LGA'])

#applies the code to each row
df['lga_code'] = df['LGA'].apply(lambda x: lga_dict[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [70]:
log_reg(df['text'], df['lga_code'])

KeyboardInterrupt: 

In [76]:
def check_text(x):
    if "local government area" in x.lower():
        return 'local government area'
    elif "lga" in x.lower():
        return 'lga'
    elif "community" in x.lower():
        return 'community'
    else:
        return "none"

In [77]:
df['lga_check'] = df['text'].apply(lambda x: check_text(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [82]:
len(df[df['lga_check']=='none'])

1892

In [83]:
len(df[df['lga_check']=='local government area'])

1615

In [84]:
len(df[df['lga_check']=='lga'])

48

In [85]:
df['text'].iloc[3000]

"caption: Gov. Kashim Shettima inspecting Budum market after Boko Haram insurgents attacked the market  Suspected Boko Haram islamists shot and killed Mohammed Ali Lawal, the district head of Bulabulin in Maiduguri around 8: am today in Maiduguri.  Alhaji Ali's attackers numbering 3 also shot and killed his 9 years old daughter. A neighbor's daughter was also caught in the cross fire. Family sources claimed she sustained serious injuries.  Since its resurgence about a year ago, the northeasatern Nigeria based islamist sect has targetted and killed several districts heads, islamic scholars, local officials and security agents in a campaign of violence."

In [88]:
vect.

{'people': 18897,
 'killed': 13589,
 'injured': 12265,
 'twin': 24200,
 'suicide': 22920,
 'attacks': 3167,
 'konduga': 13737,
 'borno': 4347,
 'state': 22617,
 'tuesday': 24133,
 'source': 22345,
 'civilian': 5407,
 'joint': 13072,
 'task': 23335,
 'force': 9716,
 'area': 2810,
 'said': 21185,
 'death': 6761,
 'toll': 23782,
 'climb': 5483,
 'past': 18781,
 '30': 498,
 'rescue': 20571,
 'operations': 18083,
 'ongoing': 17955,
 'spoke': 22475,
 'correspondent': 6183,
 'worker': 25655,
 'disclosed': 7355,
 'bombers': 4295,
 'stormed': 22728,
 'local': 14476,
 'market': 15089,
 'detonated': 7151,
 'explosives': 9037,
 'strapped': 22745,
 'bodies': 4237,
 'afternoon': 1618,
 'bringing': 4512,
 'victims': 25101,
 'hospital': 11332,
 'told': 23777,
 'lying': 14638,
 'site': 22056,
 'blasts': 4151,
 '00': 0,
 'attack': 3162,
 'launched': 14163,
 'madurari': 14707,
 'community': 5721,
 'outskirts': 18447,
 'second': 21490,
 'involved': 12538,
 'according': 1201,
 'witnesses': 25618,
 'details