In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
import platform

# Record the interpreter and library versions this notebook was run with.
print(f"Python {platform.python_version()}")
print(f"Pandas {pd.__version__}")
print(f"NumPy {np.__version__}")

Python 3.6.9
Pandas 1.0.3
NumPy 1.18.2


### Database without city_state

Download the 'dataset.csv' to the folder 'db/'

In [3]:
file_name = "dataset.csv"

# Read the dataset from the local 'db/' folder into a DataFrame.
df = pd.read_csv(f"db/{file_name}")

Remove the rows where 'not_commenter' equals 0.0

In [4]:
# Drop every row whose 'not_commenter' equals 0, then renumber the rows from 0.
zero_rows = df[df['not_commenter'] == 0].index
df = df.drop(index=zero_rows).reset_index(drop=True)

Extracting the target feature to be predicted

In [5]:
# Split off the target column; everything left in `df` is a candidate feature.
y = df.loc[:, 'not_commenter']
df = df.drop('not_commenter', axis=1)

Get the one-hot dataframe for campaigns

In [6]:
# Columns whose name contains "campaign." are moved out of `df` into their own frame.
campaigns = [name for name in df.columns if "campaign." in name]

campaigns_df = df.loc[:, campaigns]
df = df.drop(campaigns, axis=1)

Create a dataframe with the BOW

In [7]:
# Columns whose name contains "text." are moved out of `df` into the BOW frame.
words = [w for w in df.columns if "text." in w]

bow_df = df.loc[:, words]
df = df.drop(words, axis=1)

Create a dataframe with the cities

In [8]:
# Columns whose name contains "city_state." are moved out of `df` into their own frame.
city = [c for c in df.columns if "city_state." in c]

city_df = df.loc[:, city]
df = df.drop(city, axis=1)

Convert each email hash to its number of occurrences

In [9]:
# Replace each email hash with the number of rows that share it.
# Series.map performs one lookup against the value_counts Series, instead of
# handing replace() a dict with an entry for every distinct hash.
# value_counts() skips missing values by default, so missing hashes stay NaN.
email_counts = df['email_hash'].value_counts()
df['email_hash'] = df['email_hash'].map(email_counts)

Normalize the data

In [10]:
# Column-wise extrema, computed once and reused for both the scaling and the pickles.
bow_min, bow_max = bow_df.min(), bow_df.max()
df_min, df_max = df.min(), df.max()

# Min-max scale every column: (value - min) / (max - min).
# NOTE(review): a column whose max equals its min divides 0 by 0 and becomes NaN.
normalized_bow = (bow_df - bow_min) / (bow_max - bow_min)
normalized_df = (df - df_min) / (df_max - df_min)

# Persist the (max, min) pairs so the same scaling can be reproduced later.
bow_mm = (bow_max, bow_min)
df_mm = (df_max, df_min)
path = 'results/'
# Context managers guarantee the pickle files are flushed and closed.
with open(path + 'bow_mm.p', "wb") as bow_file:
    pickle.dump(bow_mm, bow_file)
with open(path + 'df_mm.p', "wb") as df_file:
    pickle.dump(df_mm, df_file)

Generate the subsets

In [11]:
# Column groups shared by the feature subsets written below.
base_cols = ['date_disseminated', 'date_received', 'submitted', 'email_confirmation']
campaign_bin_cols = ['campaign_centered_bin', 'campaign_submitted_bin']
all_bin_cols = ['all_centered_bin', 'all_submitted_bin']

# B1: the four normalized baseline columns.
baseline_df = normalized_df[base_cols].copy()
baseline_df.to_csv('db/B1.csv', index=False)

# B2: baseline + campaign one-hot columns.
baseline_cam_df = pd.concat([baseline_df, campaigns_df], axis=1)
baseline_cam_df.to_csv('db/B2.csv', index=False)

# B3: B2 + the normalized per-campaign bin columns.
baseline_cam_bins_df = pd.concat(
    [baseline_cam_df, normalized_df[campaign_bin_cols]], axis=1
)
baseline_cam_bins_df.to_csv('db/B3.csv', index=False)

# B4: baseline + normalized bag-of-words columns.
baseline_bow_df = pd.concat([baseline_df, normalized_bow], axis=1)
baseline_bow_df.to_csv('db/B4.csv', index=False)

# B5: B4 + the normalized "all" bin columns.
baseline_bow_bins_df = pd.concat(
    [baseline_bow_df, normalized_df[all_bin_cols]], axis=1
)
baseline_bow_bins_df.to_csv('db/B5.csv', index=False)

# The target column is written on its own.
y.to_csv('db/Y.csv', index=False)

### PCA Database

Download Rafael's PCA database. It was extracted to 'db/PCANC' here (the path the code below reads from), with the subdirectories:

- 'db/PCANC/0/'
- 'db/PCANC/1/'
- 'db/PCANC/2/'
- 'db/PCANC/3/'
- 'db/PCANC/4/'

In [12]:
# Root folder of the PCA database. This rebinds `path`, which an earlier cell set to 'results/'.
path = 'db/PCANC/'

The LR code uses the full database and already implements cross-validation. In the PCA database the data is split into ten folds plus a test set, so we need to merge them all into a single file for later use in the LR notebook.

In [13]:
# For each of the five PCANC subdirectories, merge the test set and the ten
# fold files into one feature CSV and one label CSV.
for i in range(5):
    fold_dir = path + str(i) + '/'

    # Test set first, then folds 1..10; this fixes the row order of the merged files.
    Xfn = [fold_dir + 'X_test_PCA_' + str(i) + '.csv']
    yfn = [fold_dir + 'y_test_' + str(i) + '.csv']
    for j in range(1, 11):
        Xfn.append(fold_dir + 'X_PCA_' + str(i) + '_' + str(j) + '.csv')
        yfn.append(fold_dir + 'y_' + str(i) + '_' + str(j) + '.csv')

    # Read and concatenate each file exactly once; ignore_index renumbers rows from 0.
    # NOTE: `y` here rebinds the target Series extracted from dataset.csv earlier.
    X = pd.concat([pd.read_csv(f, header=None) for f in Xfn], ignore_index=True)
    y = pd.concat([pd.read_csv(f, header=None) for f in yfn], ignore_index=True)

    X.to_csv(path + 'XPCANC_' + str(i) + '.csv', index=False)
    y.to_csv(path + 'yPCANC_' + str(i) + '.csv', index=False)