In [110]:
import pandas as pd
from sklearn.linear_model import LinearRegression

In [2]:
df = pd.read_csv('final_project.csv')
df.head()

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x41,x42,x43,x44,x45,x46,x47,x48,x49,y
0,-0.166563,-3.961588,4.621113,2.481908,-1.800135,0.804684,6.718751,-14.789997,-1.040673,-4.20495,...,-1.497117,5.414063,-2.325655,1.674827,-0.264332,60.781427,-7.689696,0.151589,-8.040166,0
1,-0.149894,-0.585676,27.839856,4.152333,6.426802,-2.426943,40.477058,-6.725709,0.896421,0.330165,...,36.29279,4.490915,0.762561,6.526662,1.007927,15.805696,-4.896678,-0.320283,16.719974,0
2,-0.321707,-1.429819,12.251561,6.586874,-5.304647,-11.31109,17.81285,11.060572,5.32588,-2.632984,...,-0.368491,9.088864,-0.689886,-2.731118,0.7542,30.856417,-7.428573,-2.090804,-7.869421,0
3,-0.245594,5.076677,-24.149632,3.637307,6.505811,2.290224,-35.111751,-18.913592,-0.337041,-5.568076,...,15.691546,-7.467775,2.940789,-6.424112,0.419776,-72.424569,5.361375,1.80607,-7.670847,0
4,-0.273366,0.306326,-11.352593,1.676758,2.928441,-0.616824,-16.505817,27.532281,1.199715,-4.309105,...,-13.911297,-5.229937,1.783928,3.957801,-0.096988,-14.085435,-0.208351,-0.894942,15.724742,1


The majority of the data is float64. We will take a deeper look at the "object" variables.

In [5]:
df.dtypes

x0     float64
x1     float64
x2     float64
x3     float64
x4     float64
x5     float64
x6     float64
x7     float64
x8     float64
x9     float64
x10    float64
x11    float64
x12    float64
x13    float64
x14    float64
x15    float64
x16    float64
x17    float64
x18    float64
x19    float64
x20    float64
x21    float64
x22    float64
x23    float64
x24     object
x25    float64
x26    float64
x27    float64
x28    float64
x29     object
x30     object
x31    float64
x32     object
x33    float64
x34    float64
x35    float64
x36    float64
x37     object
x38    float64
x39    float64
x40    float64
x41    float64
x42    float64
x43    float64
x44    float64
x45    float64
x46    float64
x47    float64
x48    float64
x49    float64
y        int64
dtype: object

x24 is a column of what looks like continents, between europe, asia, america and nan. Definitely not numeric. 

In [7]:
df['x24'].unique()

array(['euorpe', 'asia', 'america', nan], dtype=object)

x29 needs to be cleaned. It is a column of months, where most months are 3 letters, but some are longer. 

In [9]:
df['x29'].unique()

array(['July', 'Aug', 'Jun', 'May', 'sept.', 'Apr', 'Nov', 'Oct', nan,
       'Mar', 'Feb', 'Dev', 'January'], dtype=object)

x30 is a column of weekdays, monday through friday. Some days are misspelled. 

In [12]:
df['x30'].unique()

array(['tuesday', 'wednesday', 'thurday', 'monday', 'friday', nan],
      dtype=object)

x32 is a column of percents that is being read in as str instead of numeric. 

In [13]:
df['x32'].unique()

array(['0.0%', '-0.02%', '-0.01%', '0.01%', '-0.03%', '0.02%', '-0.0%',
       '-0.04%', nan, '0.03%', '0.04%', '-0.05%', '0.05%'], dtype=object)

x37 is a row of dollar values that were read in as str instead of numeric. 

In [15]:
df['x37'].unique()

array(['$1313.96', '$1962.78', '$430.47', ..., '$1588.65', '$439.21',
       '$-1229.34'], dtype=object)

There are between 20 and 50 NA's in each column. 

In [17]:
df.isna().sum()

x0     26
x1     25
x2     38
x3     37
x4     26
x5     37
x6     26
x7     27
x8     21
x9     30
x10    43
x11    30
x12    36
x13    31
x14    34
x15    35
x16    26
x17    27
x18    40
x19    35
x20    38
x21    29
x22    27
x23    47
x24    28
x25    22
x26    36
x27    30
x28    35
x29    30
x30    30
x31    39
x32    31
x33    41
x34    41
x35    30
x36    27
x37    23
x38    31
x39    23
x40    36
x41    40
x42    26
x43    37
x44    40
x45    29
x46    31
x47    37
x48    32
x49    32
y       0
dtype: int64

If we were to drop all of the NA's, we would lose about 1% of the data. We can proceed with dropping NA's without fear of losing too much data. 

In [20]:
print(len(df))
print(len(df.dropna()))
print(len(df.dropna()) / len(df))

160000
158392
0.98995


Looks like we have a variety of ranges for the rest of our numeric variables. All variables have a minimum in the negatives, while all variables have a maximum in the positives. y is a binary 0 or 1, as expected. 

In [34]:
df_numeric = df.dropna()
df_numeric = df_numeric.drop(columns = ['x24', 'x29', 'x30', 'x32', 'x37'])
df_numeric.max() - df_numeric.min()

x0       3.193484
x1      54.266480
x2     122.939701
x3      72.770852
x4      54.715347
x5      69.373098
x6     178.745088
x7     330.657610
x8      76.740876
x9      55.358501
x10     74.252154
x11     74.453313
x12    137.477321
x13     81.115692
x14     63.451554
x15     30.784918
x16     48.004106
x17     71.452946
x18     39.851672
x19     69.148946
x20     54.491956
x21     89.739357
x22     48.507205
x23    125.130841
x25     11.678822
x26      7.809135
x27     60.648629
x28    140.650550
x31     23.536528
x33     15.238574
x34     70.958034
x35     19.900575
x36     13.865568
x38    164.765540
x39     43.647238
x40    162.883674
x41    182.217656
x42     50.601790
x43     13.557155
x44     37.053246
x45      3.422426
x46    352.686244
x47     41.923187
x48     16.716707
x49    132.668795
y        1.000000
dtype: float64

In [36]:
df_numeric.min()

x0      -1.592635
x1     -26.278302
x2     -59.394048
x3     -33.864827
x4     -28.467536
x5     -33.822988
x6     -86.354483
x7    -181.506976
x8     -37.691045
x9     -27.980659
x10    -36.306571
x11    -38.092869
x12    -64.197967
x13    -38.723514
x14    -30.905214
x15    -17.002359
x16    -26.042983
x17    -34.395898
x18    -20.198686
x19    -35.633396
x20    -26.677396
x21    -43.501854
x22    -23.644193
x23    -66.640341
x25     -6.364653
x26     -3.857484
x27    -32.003555
x28    -72.896705
x31    -12.289364
x33     -7.451454
x34    -36.116606
x35    -10.008149
x36     -6.866024
x38    -74.297559
x39    -22.101647
x40    -74.059196
x41    -82.167224
x42    -27.933750
x43     -6.876234
x44    -17.983487
x45     -1.753221
x46   -201.826828
x47    -21.086333
x48     -8.490155
x49    -65.791191
y        0.000000
dtype: float64

In [35]:
df_numeric.max()

x0       1.600849
x1      27.988178
x2      63.545653
x3      38.906025
x4      26.247812
x5      35.550110
x6      92.390605
x7     149.150634
x8      39.049831
x9      27.377842
x10     37.945583
x11     36.360443
x12     73.279354
x13     42.392177
x14     32.546340
x15     13.782559
x16     21.961123
x17     37.057048
x18     19.652986
x19     33.515550
x20     27.814560
x21     46.237503
x22     24.863012
x23     58.490500
x25      5.314169
x26      3.951652
x27     28.645074
x28     67.753845
x31     11.247163
x33      7.787120
x34     34.841428
x35      9.892426
x36      6.999544
x38     90.467981
x39     21.545591
x40     88.824477
x41    100.050432
x42     22.668041
x43      6.680922
x44     19.069759
x45      1.669205
x46    150.859415
x47     20.836854
x48      8.226552
x49     66.877604
y        1.000000
dtype: float64

In [38]:
df_numeric['y'].unique()

array([0, 1])

Aside from dropping the NA's, it doesn't look like we need to do much cleaning with the variables that are already numeric. Thus, cleaning will focus on 'x24', 'x29', 'x30', 'x32' and 'x37'. First, we drop NA's.

In [89]:
df_na = df.dropna()

x24 has 'euorpe', 'asia' and 'america' as possible values. We will fix the spelling of europe and capitalize the first letter of each continent name. 

In [90]:
df_na['x24'].unique()

array(['euorpe', 'asia', 'america'], dtype=object)

In [91]:
df_na['x24'] = df_na['x24'].str.replace('euorpe', 'Europe')
df_na['x24'] = df_na['x24'].str.replace('asia', 'Asia')
df_na['x24'] = df_na['x24'].str.replace('america', 'America')
df_na['x24'].unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_na['x24'] = df_na['x24'].str.replace('euorpe', 'Europe')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_na['x24'] = df_na['x24'].str.replace('asia', 'Asia')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_na['x24'] = df_na['x24'].str.replace('america', 'America')


array(['Europe', 'Asia', 'America'], dtype=object)

We move on to the 'x29' variable, which has months in various forms. We will stick with the 3 letter form of each month, since most months are spelled that way. That means 'July', 'sept.', 'Dec' and 'January' need to be fixed. 

In [92]:
df_na['x29'].unique()

array(['July', 'Aug', 'Jun', 'May', 'sept.', 'Apr', 'Nov', 'Oct', 'Mar',
       'Feb', 'Dev', 'January'], dtype=object)

In [93]:
df_na['x29'] = df_na['x29'].str.replace('July', 'Jul')
df_na['x29'] = df_na['x29'].str.replace('sept.', 'Sep')
df_na['x29'] = df_na['x29'].str.replace('Dev', 'Dec')
df_na['x29'] = df_na['x29'].str.replace('January', 'Jan')
df_na['x29'].unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_na['x29'] = df_na['x29'].str.replace('July', 'Jul')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_na['x29'] = df_na['x29'].str.replace('sept.', 'Sep')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_na['x29'] = df_na['x29'].str.replace('Dev', 'Dec')
A value is trying to be set on a copy of

array(['Jul', 'Aug', 'Jun', 'May', 'Sep', 'Apr', 'Nov', 'Oct', 'Mar',
       'Feb', 'Dec', 'Jan'], dtype=object)

Here, we look at the x30 variable, which seems to be days of the week. We'll capitalize like we had with the continents, and fix the spelling of 'thurday'.

In [94]:
df_na['x30'].unique()

array(['tuesday', 'wednesday', 'thurday', 'monday', 'friday'],
      dtype=object)

In [95]:
df_na['x30'] = df_na['x30'].str.replace('tuesday', 'Tuesday')
df_na['x30'] = df_na['x30'].str.replace('wednesday', 'Wednesday')
df_na['x30'] = df_na['x30'].str.replace('thurday', 'Thursday')
df_na['x30'] = df_na['x30'].str.replace('monday', 'Monday')
df_na['x30'] = df_na['x30'].str.replace('friday', 'Friday')
df_na['x30'].unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_na['x30'] = df_na['x30'].str.replace('tuesday', 'Tuesday')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_na['x30'] = df_na['x30'].str.replace('wednesday', 'Wednesday')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_na['x30'] = df_na['x30'].str.replace('thurday', 'Thursday')
A value is try

array(['Tuesday', 'Wednesday', 'Thursday', 'Monday', 'Friday'],
      dtype=object)

Here, we investigate the 'x32' variable. These are percents that still have % at the end, so pandas interprets them as strings. We will remove the % at the end, convert to numeric, and divide each value by 100 to convert percent to decimal. Note that the result only has 11 unique values instead of 12, as -0.0% and 0.0% were interpreted as two different values initially. 

In [96]:
df_na['x32'].unique()

array(['0.0%', '-0.02%', '-0.01%', '0.01%', '-0.03%', '0.02%', '-0.0%',
       '-0.04%', '0.03%', '0.04%', '-0.05%', '0.05%'], dtype=object)

In [97]:
df_na['x32'] = df_na['x32'].str.replace('%', '')
df_na['x32'] = df_na['x32'].astype('float64')
df_na['x32'] = df_na['x32'] / 100
df_na['x32'].unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_na['x32'] = df_na['x32'].str.replace('%', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_na['x32'] = df_na['x32'].astype('float64')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_na['x32'] = df_na['x32'] / 100


array([ 0.    , -0.0002, -0.0001,  0.0001, -0.0003,  0.0002, -0.0004,
        0.0003,  0.0004, -0.0005,  0.0005])

Finally, we have x37. This variable was interpreted as a string due to the leading dollar sign. We will remove this dollar sign and convert the result to numeric. 

In [98]:
df_na['x37'].unique()

array(['$1313.96', '$1962.78', '$430.47', ..., '$1588.65', '$439.21',
       '$-1229.34'], dtype=object)

In [99]:
df_na['x37'] = df_na['x37'].str.replace('$', '').astype('float64').round(2)
df_na['x37'].unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_na['x37'] = df_na['x37'].str.replace('$', '').astype('float64').round(2)


array([ 1313.96,  1962.78,   430.47, ...,  1588.65,   439.21, -1229.34])

In [100]:
df_clean = df_na

We have no NA's in our df_clean, and our variables are all numeric other than x24, x29 and x30, which is expected. We are ready to move forward with EDA and modelling.

In [107]:
df_clean.isna().sum().sum()

0

In [109]:
df_clean.dtypes

x0     float64
x1     float64
x2     float64
x3     float64
x4     float64
x5     float64
x6     float64
x7     float64
x8     float64
x9     float64
x10    float64
x11    float64
x12    float64
x13    float64
x14    float64
x15    float64
x16    float64
x17    float64
x18    float64
x19    float64
x20    float64
x21    float64
x22    float64
x23    float64
x24     object
x25    float64
x26    float64
x27    float64
x28    float64
x29     object
x30     object
x31    float64
x32    float64
x33    float64
x34    float64
x35    float64
x36    float64
x37    float64
x38    float64
x39    float64
x40    float64
x41    float64
x42    float64
x43    float64
x44    float64
x45    float64
x46    float64
x47    float64
x48    float64
x49    float64
y        int64
dtype: object

In [117]:
df_clean.to_csv('Cleaned_final_dataset.csv')

In [114]:
lr = LinearRegression().fit(df_clean.drop(columns = ['y', 'x24', 'x29', 'x30']).to_numpy(), df_clean['y'].to_numpy())

In [116]:
lr.coef_

array([-2.49469094e-04,  2.81660388e-04, -6.93490854e+00,  1.55154486e-04,
        9.61388980e-05, -6.16116979e-05, -1.00828359e+01,  5.18206178e-01,
       -2.62282423e-04,  9.11735943e-05, -1.03036707e-04,  5.54499325e-05,
       -6.86673421e+00, -1.03213487e-04, -1.23800771e-04, -5.39565987e-05,
        5.01401096e-04, -1.44063211e-04,  4.27797342e-04,  2.26125244e-04,
       -2.36517097e+00,  1.46103528e-04,  1.03830394e-04, -2.61516938e+00,
        1.56131816e-03, -5.39545661e-04,  1.52924780e+01,  9.39653674e+00,
       -7.04192917e-05, -8.74014917e+01, -2.88352983e-04,  2.95160340e-04,
       -2.79726695e-04,  2.48166469e-04,  3.55450737e-01, -1.97382921e+00,
        4.52009913e-06, -7.56655867e+00, -2.18289900e+00,  5.33598532e+00,
        4.94997571e-04,  7.84689895e-05, -3.25256477e-03, -2.23202337e+00,
        2.97152596e-05, -8.98891135e+00, -5.25065338e+00])