In [56]:
import numpy as np
import pandas as pd
from textblob import TextBlob
from nltk.corpus import stopwords
from sklearn.metrics import recall_score, precision_score
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.externals import joblib
from textblob.classifiers import DecisionTreeClassifier, NaiveBayesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

en_stopwords = stopwords.words('english')

In [70]:
def train(df, target): # short cut function for this dang thing... rather nice for future use.
    return train_test_split(
        df,      # [[col for col in df if col != target]],
        target,
        test_size = .2,
        random_state = 42
    )

In [58]:
def evaluate_model(algorithm, train_test):
    train_X, test_X,train_y,test_y = train_test
    model = algorithm().fit(train_X, train_y.values.ravel())
    score = model.score(test_X,test_y)
    print (f" accuracy {score}")
    return model, score

In [59]:
def k_fold (df, target, algorithm):
    scores = []
    features #= df[[col for col in df if col != target]]
    target #= df[target]
    kf = KFold(n_splits = 5, random_state=42)
    
    for train_i, test_i, in kf.split(df):
        scores.append(evaluate_model(
            algorithm,
            (features.iloc[train_i], features.iloc[test_i], 
             target.iloc[train_i], target.iloc[test_i]
            ))[1]
        )
    return sum(scores)/ len(scores)

In [79]:
def confusion_(y, predict):
    return pd.DataFrame(data = confusion_matrix(y, predict), 
                       columns= ['not','deadbeats'], 
                       index= ['not', 'deadbeats'])

In [61]:
df = pd.read_csv('new_export_dataframe_.csv')
df.columns = ['drop1', 'drop2', 'drop3','drop4', 'target','features']
df1 = df[['target','features']]

In [62]:
df1['features'].apply( # remuving stopwords. 
    lambda x: ' '.join([w for w in x.lower().split() if w not in en_stopwords]))

# create deadbeats as target.
df1['target_d'] = df1['target'].apply(lambda x: True if x == 'Deadbeats' else False)

# reduce data set to eliminate inbalace
temp = df1.query('target_d == 0').sample(900)
temp_d = df1.query('target_d  == 1')
reduced_set = pd.concat([temp, temp_d])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [63]:
target = reduced_set['target_d']
features = reduced_set['features']


train_X,test_X,train_y,test_y = train(features, target )

In [64]:
# naive bays
train = pd.concat([train_X, train_y], axis=1)
test = pd.concat([test_X, test_y], axis=1)

tbbays = NaiveBayesClassifier(train.values)
tbbays.accuracy(test.values)

0.7098591549295775

In [67]:
tfidf = TfidfVectorizer()
word_count_vectors = tfidf.fit_transform(reduced_set['features'].apply(
    lambda x: ' '.join([w for w in x.lower().split() if w not in en_stopwords])
).values).todense().tolist()
features = pd.DataFrame(
    data = word_count_vectors,
    columns = tfidf.get_feature_names()
)

In [71]:
train_X,test_X,train_y,test_y = train(features, target )

## random forest

In [72]:
forest = RandomForestClassifier(max_depth=20, n_estimators=100).fit(train_X, train_y.values.ravel())
forest.score(test_X,test_y)

0.6676056338028169

In [73]:
k_forest = k_fold(features, target,RandomForestClassifier) # kfold forest



 accuracy 0.5887323943661972




 accuracy 0.384180790960452




 accuracy 0.6101694915254238




 accuracy 0.3700564971751412




 accuracy 0.307909604519774


In [81]:
confusion_(test_y, forest.predict(test_X))

Unnamed: 0,not,deadbeats
not,151,25
deadbeats,93,86


## Naive bayes

In [80]:
naive = evaluate_model(GaussianNB, train(features, target))[0]
confusion_(test_y, naive.predict(test_X))

 accuracy 0.6112676056338028


Unnamed: 0,not,deadbeats
not,62,114
deadbeats,24,155


In [76]:
k_fold(features, target, GaussianNB ) # kfold naivebays. 

 accuracy 0.7633802816901408
 accuracy 0.6977401129943502
 accuracy 0.5451977401129944
 accuracy 0.6073446327683616
 accuracy 0.6751412429378532


0.65776080210074

## Knearest neighbor

In [77]:
knn = evaluate_model(KNeighborsClassifier,train(features, target))[0]
confusion_(test_y, knn.predict(test_X))

 accuracy 0.5464788732394367


Unnamed: 0,True,False
True,138,38
False,123,56


In [78]:
k_fold(features, target,KNeighborsClassifier)

 accuracy 0.5690140845070423
 accuracy 0.5847457627118644
 accuracy 0.576271186440678
 accuracy 0.3728813559322034
 accuracy 0.3926553672316384


0.49911355136468527