In [1]:
import os
csvs_dir = os.path.abspath(os.path.join('../csvs'))

In [2]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [3]:
creatives_df = pd.read_csv(os.path.join(csvs_dir,'creatives.csv')).set_index('PROJECT_ID')
cpms_df = pd.read_csv(os.path.join(csvs_dir,'cpms.csv')).set_index('PROJECT_ID')
project_df = pd.read_csv(os.path.join(csvs_dir,'project.csv')).set_index('PROJECT_ID')

In [4]:
df_final = creatives_df.join(cpms_df[['centrality']], how='inner').join(project_df[['density']], how = 'inner')

In [5]:
df_final['IS_ONTIME'] = df_final.IS_ONTIME*1

In [6]:
df_final = df_final[['IS_ONTIME', 'degree_centrality_var', 'degree_entropy',
       'eigen_centrality_var', 'eigen_entropy', 'centrality',
       'density','n_users']]

In [7]:
df_final.dropna(inplace=True)

In [None]:
degree_sequence = sorted((d for n, d in G.degree()), reverse=True)

In [8]:
scaler = preprocessing.StandardScaler()
enc = preprocessing.OneHotEncoder(handle_unknown='ignore')

In [9]:
X = df_final[['eigen_centrality_var','centrality', 'density','n_users']]
y = df_final[['IS_ONTIME']]

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [12]:
1 - y_test.sum()/len(y_test)

IS_ONTIME    0.848901
dtype: float64

In [13]:
clf = SVC().fit(X_train, y_train)
from sklearn.metrics import classification_report
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      1.00      0.92       309
           1       0.00      0.00      0.00        55

    accuracy                           0.85       364
   macro avg       0.42      0.50      0.46       364
weighted avg       0.72      0.85      0.78       364



  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
clf = DecisionTreeClassifier(max_depth=10)
clf .fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.93      0.88       309
           1       0.04      0.02      0.03        55

    accuracy                           0.79       364
   macro avg       0.44      0.47      0.45       364
weighted avg       0.72      0.79      0.75       364



In [17]:
clf.feature_importances_

array([0.41863278, 0.24870599, 0.14330533, 0.18935589])

In [19]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.93      0.88       309
           1       0.04      0.02      0.03        55

    accuracy                           0.79       364
   macro avg       0.44      0.47      0.45       364
weighted avg       0.72      0.79      0.75       364

