In [106]:
import datetime     # timing the program
import csv
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import preprocessing, linear_model
import matplotlib.pyplot as plt
plt.rc("font", size=14)
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.feature_selection import SelectFromModel, RFE, SelectFpr

In [45]:
with open("/Users/ChantelleChiu/Documents/GitHub/Project-90106-G28/Chantelle/cleaned_15_Sep.csv") as file:
    raw_data = pd.read_csv(file, header=0)
    le = preprocessing.LabelEncoder()
    data = raw_data.apply(le.fit_transform)
    file.close()

In [46]:
X_train = data.iloc[:,1:17]
Y_train = data.iloc[:,0]

In [47]:
print(X_train.shape,Y_train.shape)
print(Y_train)

(5408, 16) (5408,)
0       0
1       1
2       1
3       1
4       1
       ..
5403    1
5404    0
5405    1
5406    1
5407    1
Name: StageName, Length: 5408, dtype: int64


In [50]:
lm = LogisticRegression(random_state=0, multi_class='ovr', solver='liblinear', penalty='l2', dual=False, 
                       tol=1e-4, fit_intercept=True, intercept_scaling=1, class_weight=None)

In [51]:
lm.fit(X_train, Y_train)

LogisticRegression(multi_class='ovr', random_state=0, solver='liblinear')

In [52]:
# cross validation
cv_result = cross_val_score(lm, X_train, Y_train, cv=10)

Running time: 0:00:00.454509 Seconds


In [70]:
# Accuracy

accuracy = cross_val_score(lm, X_train, Y_train, cv=10, scoring='accuracy')
print("mean accuracy of cv=10 on lm: %.4f" % accuracy.mean())

mean accuracy of cv=10 on lm: 0.6610


In [71]:
# Precision

precision = cross_val_score(lm, X_train, Y_train, cv=10, scoring='precision')
print("mean precision of cv=10 on lm: %.4f" % precision.mean())

mean precision of cv=10 on lm: 0.5848


In [72]:
# Recall

recall = cross_val_score(lm, X_train, Y_train, cv=10, scoring='recall')
print("mean recall of cv=10 on lm: %.4f" % recall.mean())

mean recall of cv=10 on lm: 0.3666


In [75]:
predictors = X_train
selector = RFE(lm, n_features_to_select=5, step=1)
selector.fit(X_train, Y_train)

RFE(estimator=LogisticRegression(multi_class='ovr', random_state=0,
                                 solver='liblinear'),
    n_features_to_select=5)

In [82]:
support = selector.support_
print(support)

[False False  True False False False False False False  True False  True
 False  True False  True]


In [101]:
selector.get_support(indices=True)

array([ 2,  9, 11, 13, 15])

In [81]:
order = selector.ranking_
print(order)

[ 9 11  1  5 12  2  7  8  6  1  3  1  4  1 10  1]


In [96]:
features_rank = []
for i in range(len(order)):
    features_rank.append(f"{order[i]}:   {data.columns[i]}:   {support[i]}")
    
features_rank

['9:   StageName:   False',
 '11:   RecordType.Name:   False',
 '1:   RICE_Supported__c:   True',
 '5:   Actual_Close_Date__c:   False',
 '12:   Lead_Faculty__c:   False',
 '2:   Lead_School__c:   False',
 '7:   Parent_Opportunity__c:   False',
 '8:   Industry:   False',
 '6:   Industry_Sub_Type__c:   False',
 '1:   Business_Type__c:   True',
 '3:   Is_External(1)__c:   False',
 '1:   ParentId:   True',
 '4:   CloseYear:   False',
 '1:   CloseMonth:   True',
 '10:   CreatedYear:   False',
 '1:   CreatedMonth:   True']

In [97]:
features_rank.sort()
features_rank

['10:   CreatedYear:   False',
 '11:   RecordType.Name:   False',
 '12:   Lead_Faculty__c:   False',
 '1:   Business_Type__c:   True',
 '1:   CloseMonth:   True',
 '1:   CreatedMonth:   True',
 '1:   ParentId:   True',
 '1:   RICE_Supported__c:   True',
 '2:   Lead_School__c:   False',
 '3:   Is_External(1)__c:   False',
 '4:   CloseYear:   False',
 '5:   Actual_Close_Date__c:   False',
 '6:   Industry_Sub_Type__c:   False',
 '7:   Parent_Opportunity__c:   False',
 '8:   Industry:   False',
 '9:   StageName:   False']

In [109]:
print(lm.coef_)

[[ 0.02085598 -0.00935733  0.95408494 -0.10951064  0.0033624  -0.57962174
   0.04970534 -0.03919561 -0.03939539  0.43926092 -0.1813999  -1.80938791
  -0.15238714  1.54130299 -0.01741305  1.52477968]]
