In [1]:
# Mount Google Drive into the Colab runtime at /content/drive so the
# Drive-hosted CSV files used below can be read like local files.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [117]:
from os.path import join
# Directory containing the sharded CSV files loaded in the next cell.
# The path is relative, so it only resolves when the working directory is the
# parent of the Drive mount point — NOTE(review): presumably /content; confirm.
path = join('drive', 'MyDrive', 'Project', 'Data', 'complete')

In [118]:
import pandas as pd
import numpy as np
# Read the 17 shards file_0000.csv ... file_0016.csv and stack them into a
# single frame with a fresh 0..n-1 index.
df = pd.concat(
    [pd.read_csv(join(path, f'file_{i:04d}.csv')) for i in range(17)],
    ignore_index=True,
)

In [None]:
# Remove the 'time' column. Rebinding (instead of inplace=True) together with
# errors='ignore' keeps this cell idempotent: running it a second time no
# longer raises KeyError because the column is already gone.
df = df.drop(columns=['time'], errors='ignore')

In [None]:
# Names given to the four trailing columns of `df`; each holds a per-creator
# history list that get_statistics() later summarises.
historical_vars = [
    'historical_backers',
    'historical_launch',
    'historical_state',
    'historical_pledges',
]

In [None]:
# Positionally rename the trailing columns to the names in historical_vars,
# leaving every other column name untouched. The number of renamed columns is
# derived from the list itself so the two can never drift apart.
df.columns = [*df.columns[:-len(historical_vars)], *historical_vars]

In [None]:
def get_statistics(entry):
  """Add summary statistics of the historical project lists to one row.

  Intended for row-wise use: ``df.apply(get_statistics, axis=1)``.

  Args:
    entry: one row (pandas Series) containing ``launched_at`` and the columns
      ``historical_launch``, ``historical_state``, ``historical_backers`` and
      ``historical_pledges``. A historical cell is expected to be the string
      repr of a Python list; a non-string ``historical_launch`` (presumably
      NaN from ``read_csv`` — verify) means the row has no history.

  Returns:
    The same ``entry`` object, modified in place. Rows with history gain:
      * ``historical_delta_launch``: ``launched_at`` minus each historical
        launch value (array; assumes both are numeric — TODO confirm units),
      * ``historical_success_rate``: share of states equal to 'successful',
      * ``<var>_mean/_std/_median/_max/_min`` for backers, delta_launch and
        pledges,
      * ``historical_projects``: number of entries in the pledges list.
    Rows without history (or with an empty launch list) only gain
    ``historical_projects = 0``.

  Raises:
    ValueError, SyntaxError: if a historical cell is not a valid Python
      literal.
  """
  import ast  # local import keeps the cell self-contained

  def _as_array(value):
    # literal_eval only accepts Python literals, so a malformed or hostile CSV
    # cell cannot execute arbitrary code the way eval() would.
    if isinstance(value, str):
      value = ast.literal_eval(value)
    return np.array(value)

  raw_launches = entry['historical_launch']
  launches = _as_array(raw_launches) if isinstance(raw_launches, str) else None

  # No history at all, or an empty list (max()/min() would raise on it).
  if launches is None or launches.size == 0:
    entry['historical_projects'] = 0
    return entry

  delta_launch = entry['launched_at'] - launches
  entry['historical_delta_launch'] = delta_launch

  states = ast.literal_eval(entry['historical_state'])
  entry['historical_success_rate'] = np.mean([state == 'successful' for state in states])

  histories = {
    'historical_backers': _as_array(entry['historical_backers']),
    'historical_delta_launch': delta_launch,
    'historical_pledges': _as_array(entry['historical_pledges']),
  }
  for var_name, values in histories.items():
    entry[f"{var_name}_mean"] = values.mean()
    entry[f"{var_name}_std"] = values.std()
    entry[f"{var_name}_median"] = np.median(values)
    entry[f"{var_name}_max"] = values.max()
    entry[f"{var_name}_min"] = values.min()

  # Same value as before (length of the pledges list), but stated explicitly
  # instead of relying on the loop variable surviving the for-loop.
  entry['historical_projects'] = len(histories['historical_pledges'])
  return entry
# Compute the per-row history statistics, then discard the raw list-valued
# columns (the four inputs plus the intermediate delta array).
hist_df = (
    df.apply(get_statistics, axis=1)
      .drop(columns=[*historical_vars, 'historical_delta_launch'])
)

In [None]:
# Persist the engineered history features next to the input shards
# (index=False: the row index is not written to the file).
hist_df.to_csv(join(path, 'complete_historical_variables.csv'), index=False)

In [None]:
# Sanity check: (rows, columns) of the engineered frame.
hist_df.shape

(250593, 33)

In [None]:
# Sanity check: list the columns, including the new historical_* statistics.
hist_df.columns

Index(['backers_count', 'category', 'country', 'created_at', 'creator_id',
       'currency', 'deadline', 'disable_communication', 'fx_rate', 'goal',
       'historical_backers_max', 'historical_backers_mean',
       'historical_backers_median', 'historical_backers_min',
       'historical_backers_std', 'historical_delta_launch_max',
       'historical_delta_launch_mean', 'historical_delta_launch_median',
       'historical_delta_launch_min', 'historical_delta_launch_std',
       'historical_pledges_max', 'historical_pledges_mean',
       'historical_pledges_median', 'historical_pledges_min',
       'historical_pledges_std', 'historical_projects',
       'historical_success_rate', 'id', 'launched_at', 'project_url',
       'state_changed_at', 'sub_category', 'year'],
      dtype='object')

In [128]:
import pandas as pd
# `path` is re-pointed here from .../Data/complete to its parent .../Data.
path = join('drive', 'MyDrive', 'Project', 'Data')
# NOTE(review): the last cell of this notebook writes back to this same file,
# so after one full run the input no longer has the columns the cells below
# expect — keep a copy of the original or confirm this is intended.
final = pd.read_csv(join(path, 'final_dataset.csv'))

In [8]:
# Keep only rows that have history (non-null historical_backers_max) and only
# the columns whose name contains 'historical'.
reduced = final.loc[
    final['historical_backers_max'].notna(),
    [name for name in final.columns if 'historical' in name],
]

In [37]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# Standardise the historical features before PCA so that no single column
# dominates purely because of its scale.
scaler = StandardScaler()
scaled = scaler.fit_transform(reduced)

# A fractional n_components asks PCA to keep as many components as are needed
# to explain that share (here 90%) of the variance.
pca = PCA(n_components=0.9).fit(scaled)
pca

PCA(copy=True, iterated_power='auto', n_components=0.9, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [39]:
# Project the standardised features onto the fitted principal components,
# then min-max rescale each component column.
scaled_pca = MinMaxScaler().fit_transform(pca.transform(scaled))

In [40]:
# Inspect the rescaled component matrix (one column per retained component).
scaled_pca

array([[5.28175799e-03, 1.78690881e-01, 5.06875704e-01, 3.88627424e-01,
        2.87437096e-01],
       [2.09879169e-04, 2.73457685e-01, 4.83955832e-01, 4.18758879e-01,
        2.85940689e-01],
       [2.22954727e-02, 2.24000545e-01, 5.08567959e-01, 4.17567919e-01,
        2.90672427e-01],
       ...,
       [1.36687408e-03, 2.52368980e-01, 4.85425060e-01, 4.12659974e-01,
        2.88797671e-01],
       [5.21725067e-03, 2.20532357e-01, 4.71316057e-01, 4.17265260e-01,
        2.89057274e-01],
       [1.94622025e-03, 2.18966657e-01, 4.71534108e-01, 4.28694956e-01,
        2.85419755e-01]])

In [44]:
# Attach one historic_pc<i> column per retained principal component.
reduced = reduced.assign(
    **{f'historic_pc{k}': scaled_pca[:, k] for k in range(scaled_pca.shape[1])}
)

In [45]:
# Write the component scores into `final` for the rows present in `reduced`
# (aligned by index label); rows outside `reduced` get no value here.
final.loc[reduced.index, [f'historic_pc{i}' for i in range(scaled_pca.shape[1])]] = scaled_pca

In [53]:
# Replace missing values with 0 in every column whose name contains
# 'historic_' (e.g. historic_pc*; 'historical_*' names do not contain this
# substring and are therefore left alone).
final = final.fillna({name: 0 for name in final.columns if 'historic_' in name})

In [56]:
# Missing pledged amounts are replaced with 0.
final['usd_pledged'] = final['usd_pledged'].fillna(0)

In [65]:
# 2x2 Pearson correlation matrix between usd_pledged and the first component.
np.corrcoef(final['usd_pledged'], final['historic_pc0'])

array([[1.        , 0.37768351],
       [0.37768351, 1.        ]])

In [78]:
# Min-max rescale collabs_avg_score using only the rows where it is present,
# writing the result back by index label. Note that `reduced` is reused here
# for a different subset than in the PCA cells.
reduced = final[final['collabs_avg_score'].notna()]
final.loc[reduced.index, 'collabs_avg_score'] = MinMaxScaler().fit_transform(
    reduced[['collabs_avg_score']]
)

In [82]:
# Rows without a collaboration score get 0.
final['collabs_avg_score'] = final['collabs_avg_score'].fillna(0)

In [83]:
# 2x2 Pearson correlation matrix between usd_pledged and collabs_avg_score.
np.corrcoef(final['usd_pledged'], final['collabs_avg_score'])

array([[1.        , 0.39370738],
       [0.39370738, 1.        ]])

In [87]:
# Keep every column whose name does not contain 'historical_'; the
# historic_pc* columns survive because they lack that substring.
final = final.loc[:, ['historical_' not in name for name in final.columns]]

In [114]:
# Remove columns that are not wanted in the saved dataset.
final = final.drop(
    columns=[
        'pledged',
        'backers_count',
        'created_at',
        'creator_id',
        'currency',
        'state_changed_at',
        'year',
    ]
)

In [129]:
# NOTE(review): this overwrites the very file that `final` was loaded from, so
# the notebook is not re-runnable afterwards (the historical_* and dropped
# columns no longer exist in the input). Consider a distinct output name.
final.to_csv(join(path, 'final_dataset.csv'), index=False)