In [85]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from os.path import dirname
import seaborn as sns
import time
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from sklearn import linear_model

notebook_path = os.path.abspath("DataAnalyticsKickstarterNotebook_Cedrik.ipynb")
csv_path_features = os.path.join(os.path.dirname(dirname(notebook_path)), "data/ks-project-edited-regression-features.csv")
csv_path_trueLabels = os.path.join(os.path.dirname(dirname(notebook_path)), "data/ks-project-edited-regression-target.csv")

In [2]:
df_features = pd.read_csv (csv_path_features, low_memory=False)
df_target = pd.read_csv (csv_path_trueLabels, low_memory=False)

In [3]:
df_features.head()

Unnamed: 0,usd_goal_real,duration,name_length,creator_type,Art,Comics,Crafts,Dance,Design,Fashion,Film & Video,Food,Games,Journalism,Music,Photography,Publishing,Technology,Theater
0,3000.0,20,22,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,5000.0,15,33,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,500.0,14,36,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
3,50.0,8,16,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,2000.0,81,46,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0


In [87]:
df_new_target = df_features["usd_goal_real"]
df_features.drop(columns="usd_goal_real", inplace=True)

In [27]:
df_target.head()

Unnamed: 0,usd_pledged_real
0,3222.0
1,11486.0
2,16167.71
3,207.66
4,8647.79


In [26]:
df_merged = df_features.copy()
df_merged.insert(3, 'usd_pledged_real', df_target)
df_merged.head()

Unnamed: 0,usd_goal_real,duration,name_length,usd_pledged_real,creator_type,Art,Comics,Crafts,Dance,Design,Fashion,Film & Video,Food,Games,Journalism,Music,Photography,Publishing,Technology,Theater
0,3000.0,20,22,3222.0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,5000.0,15,33,11486.0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,500.0,14,36,16167.71,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
3,50.0,8,16,207.66,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,2000.0,81,46,8647.79,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0


In [4]:
print(df_target.describe().apply(lambda s: s.apply('{0:.5f}'.format)))

      usd_pledged_real
count     129580.00000
mean       10409.71634
std        77752.77396
min            0.00000
25%           52.00000
50%         1195.00000
75%         5649.25000
max      8596474.58000


In [14]:
print(df_features.iloc[:,:3].describe(percentiles=[.005,.99]).apply(lambda s: s.apply('{0:.5f}'.format)))

         usd_goal_real      duration   name_length
count     129580.00000  129580.00000  129580.00000
mean       50975.36696      32.93627      34.55207
std      1342563.25624      11.90823      15.62217
min            0.01000       1.00000       1.00000
0.5%          18.18740       6.89500       5.00000
50%         5000.00000      30.00000      34.00000
99%       332884.52570      60.00000      60.00000
max    151395869.92000      92.00000      96.00000


In [146]:
df_outlier = df_merged[(df_merged["usd_goal_real"]<=150000) & (df_merged["usd_goal_real"] >= 100)
                        &(df_merged["duration"] <= 60) & (df_merged["name_length"] <= 60)]
df_outlier.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 124151 entries, 0 to 129579
Data columns (total 21 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   usd_goal_real     124151 non-null  float64
 1   duration          124151 non-null  int64  
 2   name_length       124151 non-null  int64  
 3   usd_pledged_real  124151 non-null  float64
 4   creator_type      124151 non-null  int64  
 5   Art               124151 non-null  int64  
 6   Comics            124151 non-null  int64  
 7   Crafts            124151 non-null  int64  
 8   Dance             124151 non-null  int64  
 9   Design            124151 non-null  int64  
 10  Fashion           124151 non-null  int64  
 11  Film & Video      124151 non-null  int64  
 12  Food              124151 non-null  int64  
 13  Games             124151 non-null  int64  
 14  Journalism        124151 non-null  int64  
 15  Music             124151 non-null  int64  
 16  Photography       12

In [47]:
print(df_outlier.iloc[:,:3].describe(percentiles=[.005,.99]).apply(lambda s: s.apply('{0:.5f}'.format)))

      usd_goal_real      duration   name_length
count  124151.00000  124151.00000  124151.00000
mean    12290.02701      32.61933      34.56253
std     20541.74278      11.31404      15.53016
min       100.00000       1.00000       1.00000
0.5%      100.00000       7.00000       5.00000
50%      5000.00000      30.00000      34.00000
99%    100000.00000      60.00000      60.00000
max    150000.00000      60.00000      60.00000


In [147]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
list_scaler = ["duration","name_length"]
for feature in list_scaler:
    scaler.fit(df_outlier[[feature]])
    df_outlier[feature] = scaler.fit_transform(df_outlier[[feature]])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_outlier[feature] = scaler.fit_transform(df_outlier[[feature]])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_outlier[feature] = scaler.fit_transform(df_outlier[[feature]])


In [148]:
df_outlier_target = df_outlier["usd_goal_real"]

In [149]:
df_outlier.drop(columns=["usd_pledged_real","usd_goal_real"], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [53]:
model = sm.OLS(df_target,df_features).fit()

In [54]:
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:       usd_pledged_real   R-squared:                       0.022
Model:                            OLS   Adj. R-squared:                  0.022
Method:                 Least Squares   F-statistic:                     164.0
Date:                Mon, 13 Dec 2021   Prob (F-statistic):               0.00
Time:                        11:37:05   Log-Likelihood:            -1.6416e+06
No. Observations:              129580   AIC:                         3.283e+06
Df Residuals:                  129561   BIC:                         3.284e+06
Df Model:                          18                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
usd_goal_real     0.0003      0.000      1.722

In [57]:
list_features = ["duration", "name_length"]
df_features_updated = df_features[list_features].copy()

In [58]:
model = sm.OLS(df_target,df_features_updated).fit()
print(model.summary())

                                 OLS Regression Results                                
Dep. Variable:       usd_pledged_real   R-squared (uncentered):                   0.021
Model:                            OLS   Adj. R-squared (uncentered):              0.020
Method:                 Least Squares   F-statistic:                              1356.
Date:                Mon, 13 Dec 2021   Prob (F-statistic):                        0.00
Time:                        11:46:12   Log-Likelihood:                     -1.6429e+06
No. Observations:              129580   AIC:                                  3.286e+06
Df Residuals:                  129578   BIC:                                  3.286e+06
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                  coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------

In [49]:
model = sm.OLS(df_outlier_target,df_outlier).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:       usd_pledged_real   R-squared:                       0.072
Model:                            OLS   Adj. R-squared:                  0.072
Method:                 Least Squares   F-statistic:                     536.1
Date:                Tue, 14 Dec 2021   Prob (F-statistic):               0.00
Time:                        14:15:57   Log-Likelihood:            -1.5247e+06
No. Observations:              124151   AIC:                         3.049e+06
Df Residuals:                  124132   BIC:                         3.050e+06
Df Model:                          18                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
usd_goal_real     0.5176      0.008     67.881

In [52]:
df_outlier.drop(columns=["duration"], inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [65]:
df_outlier.drop(columns=["usd_pledged_real"],inplace=True)

In [66]:
df_outlier = sm.add_constant(df_outlier)
model = sm.OLS(df_outlier_target,df_outlier).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:          usd_goal_real   R-squared:                       0.107
Model:                            OLS   Adj. R-squared:                  0.107
Method:                 Least Squares   F-statistic:                     876.1
Date:                Tue, 14 Dec 2021   Prob (F-statistic):               0.00
Time:                        14:38:08   Log-Likelihood:            -1.4020e+06
No. Observations:              124151   AIC:                         2.804e+06
Df Residuals:                  124133   BIC:                         2.804e+06
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const         2962.6608    203.575     14.553   

In [157]:
df_outlier.drop(columns="target", inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [163]:
df_outlier_target

0          3000.00
1          5000.00
2           500.00
5         30000.00
6          4000.00
            ...   
129574    10000.00
129575     8437.96
129577     5000.00
129578     2587.10
129579     1200.00
Name: usd_goal_real, Length: 124151, dtype: float64

In [159]:
X_train, X_test, y_train, y_test = train_test_split(df_outlier, df_outlier_target, random_state=42)

In [160]:
clf_Linear = linear_model.LinearRegression()
clf_Linear.fit(X_train, y_train)
predicted = clf_Linear.predict(X_test)
expected = y_test

In [164]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
print(mean_squared_error(expected, predicted))
print(mean_absolute_error(expected, predicted))

372271890.9718815
11361.25918648109


In [165]:
from sklearn.metrics import explained_variance_score
explained_variance_score(expected,predicted)

0.10801074522885046