In [40]:
import pandas as pd

# Locations of the two tab-separated document tables: the positive
# (trafficking) examples and, judging by the file name, the negative examples.
pos_file = '/dfs/scratch0/jdunnmon/data/memex-data/predictive_model/trafficking_docs.tsv'
neg_file = '/dfs/scratch0/jdunnmon/data/memex-data/predictive_model/trafficking_docs_neg.tsv'

# Load each table into its own dataframe. The delimiter is passed by keyword:
# supplying `sep` positionally is deprecated and rejected by pandas 2.x.
pos_df = pd.read_csv(pos_file, sep='\t')
neg_df = pd.read_csv(neg_file, sep='\t')

In [41]:
# Stack the positive and negative examples into one frame.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and yields the same rows in the same order.
# reset_index() is deliberately called without drop=True: it stores the old
# row labels in an 'index' column, which the feature-building cell drops by name.
tot_df = pd.concat([pos_df, neg_df]).reset_index()

In [105]:
# Build the regression design matrix from tot_df:
#   1. one-hot encode the categorical columns,
#   2. remove the identifier, label and leftover row-label columns,
#   3. divide both price columns by 100.
explanatory_vars = ['area_code', 'weekday', 'domain']
df_dummies = (
    pd.get_dummies(
        tot_df,
        columns=['incall_outcall', 'ethnicity', 'hour'] + explanatory_vars,
    )
    .drop(columns=['doc_id', 'trafficking', 'index'])
    .assign(
        price_hour=lambda frame: frame['price_hour'] / 100,
        price_half_hour=lambda frame: frame['price_half_hour'] / 100,
    )
)

# Plain arrays for scikit-learn: feature matrix and label vector.
X = df_dummies.values
y = tot_df['trafficking'].values

In [106]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation; the fixed random_state keeps the
# partition the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=50,
)

In [107]:
from sklearn.linear_model import LogisticRegression

# Fit an L2-regularised logistic regression on the training split.
# The solver is pinned to 'liblinear' because that is what produced the
# recorded output of this cell (it was the library default at the time).
# Newer scikit-learn releases default to 'lbfgs', which would give different
# coefficients and may not converge within max_iter=100 on these features.
classifier = LogisticRegression(solver='liblinear')
# Left as the cell's final expression so the fitted estimator's repr is shown.
classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [108]:
# Score on the held-out split; the value matches the fraction of correct
# predictions in the confusion matrix computed further down.
classifier.score(X=X_test, y=y_test)

0.92991631799163177

In [109]:
import numpy as np
# Rank the features by the magnitude of their fitted weight, largest first.
coef = classifier.coef_[0]
coef_args = np.abs(coef).argsort()[::-1]
num_vars = coef.shape[0]
# Column names of the design matrix, reordered to follow the ranking.
top_vars = list(df_dummies.columns[coef_args])

In [110]:
# Print every feature with its signed weight, in descending order of |weight|.
ranked_weights = coef[coef_args]
for feature_name, weight in zip(top_vars, ranked_weights):
    print(f"{feature_name}: {weight:.4f}")

area_code_0: -5.0627
area_code_616: 3.9462
domain_escortsinthe.com: -3.8292
domain_myproviderguide.com: 3.8251
weekday_Thursday: -3.2969
area_code_208: 3.2450
area_code_775: 3.1681
domain_liveescortreviews.com: -3.0850
area_code_214: 3.0473
area_code_312: 3.0244
weekday_Tuesday: -2.6804
area_code_972: 2.4946
area_code_423: 2.4824
weekday_Saturday: 2.4234
weekday_Sunday: 2.3992
area_code_773: 2.3851
area_code_316: 2.3618
weekday_Friday: 2.3274
weekday_Wednesday: -2.3236
domain_eroticmugshots.com: 2.2483
area_code_281: 2.2470
area_code_614: 2.2370
area_code_410: 2.1766
area_code_540: 2.1657
area_code_424: 2.1335
weekday_Monday: 2.1314
area_code_954: 2.1245
area_code_216: 2.1083
area_code_231: 2.0726
area_code_630: 2.0545
area_code_404: 2.0529
area_code_719: 2.0303
area_code_678: 2.0057
area_code_313: 2.0037
area_code_302: 1.9543
area_code_636: 1.9193
domain_massagetroll.com: 1.7902
area_code_505: 1.7404
area_code_985: 1.6206
area_code_336: 1.5893
area_code_570: 1.5668
area_code_647: -1.5

In [111]:
from sklearn.metrics import confusion_matrix
# Predict labels for the held-out rows and tabulate them against the truth.
y_pred = classifier.predict(X_test)
conf = confusion_matrix(y_true=y_test, y_pred=y_pred)

# First the matrix as a fraction of all test rows, then the raw counts.
print(conf / conf.sum())
print(conf)

[[ 0.48091004  0.03399582]
 [ 0.03608787  0.44900628]]
[[1839  130]
 [ 138 1717]]


In [96]:
# Number of rows in the training split.
X_train.shape[0]

7762