In [419]:
import pandas as pd
import matplotlib.pyplot as plt

In [420]:
# Load the training data (includes the `target` column); the path is relative
# to the notebook's working directory.
df = pd.read_csv('internship_train.csv')

In [475]:
# Overview of column dtypes and non-null counts for the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90000 entries, 0 to 89999
Data columns (total 54 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       90000 non-null  int64  
 1   1       90000 non-null  int64  
 2   2       90000 non-null  int64  
 3   3       90000 non-null  int64  
 4   4       90000 non-null  int64  
 5   5       90000 non-null  int64  
 6   6       90000 non-null  float64
 7   7       90000 non-null  float64
 8   8       90000 non-null  int64  
 9   9       90000 non-null  int64  
 10  10      90000 non-null  int64  
 11  11      90000 non-null  int64  
 12  12      90000 non-null  int64  
 13  13      90000 non-null  float64
 14  14      90000 non-null  float64
 15  15      90000 non-null  float64
 16  16      90000 non-null  float64
 17  17      90000 non-null  float64
 18  18      90000 non-null  float64
 19  19      90000 non-null  float64
 20  20      90000 non-null  float64
 21  21      90000 non-null  float64
 22

In [422]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
train_set, test_set = train_test_split(df, test_size=0.3, random_state=87)

In [423]:
# Separate the predictors from the `target` column for both splits.
X = train_set.drop(columns='target')
y = train_set['target']
X_test = test_set.drop(columns='target')
y_test = test_set['target']

In [424]:
# Pairwise correlation of every column, then the column of correlations
# against `target`.
# NOTE(review): this is computed on the full `df`, i.e. including the rows held
# out in `test_set` — consider using `train_set` to keep the EDA leak-free.
corr_matrix = df.corr()
corr_matrix['target']

0         0.002427
1        -0.005545
2         0.001003
3         0.002807
4        -0.001300
5        -0.002125
6         0.000666
7         0.012103
8         0.000347
9         0.001504
10        0.003396
11       -0.001566
12        0.002016
13       -0.001009
14       -0.000513
15        0.001653
16       -0.000853
17        0.002953
18        0.001197
19        0.001520
20       -0.000456
21        0.002874
22        0.000304
23        0.001280
24        0.001070
25        0.004276
26        0.006370
27       -0.001459
28        0.003451
29       -0.004785
30       -0.000964
31        0.005077
32        0.000561
33       -0.002301
34        0.001331
35       -0.006267
36        0.002624
37        0.001642
38       -0.003845
39        0.006283
40       -0.007216
41       -0.003770
42       -0.002876
43        0.000489
44        0.003092
45       -0.000903
46        0.001160
47        0.000319
48       -0.001183
49        0.002473
50       -0.005243
51       -0.004507
52       -0.

# Linear correlation between `target` and each of the features appears to be very low

In [425]:
# Same overview for the training features only (the `target` column removed).
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 63000 entries, 53722 to 60902
Data columns (total 53 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       63000 non-null  int64  
 1   1       63000 non-null  int64  
 2   2       63000 non-null  int64  
 3   3       63000 non-null  int64  
 4   4       63000 non-null  int64  
 5   5       63000 non-null  int64  
 6   6       63000 non-null  float64
 7   7       63000 non-null  float64
 8   8       63000 non-null  int64  
 9   9       63000 non-null  int64  
 10  10      63000 non-null  int64  
 11  11      63000 non-null  int64  
 12  12      63000 non-null  int64  
 13  13      63000 non-null  float64
 14  14      63000 non-null  float64
 15  15      63000 non-null  float64
 16  16      63000 non-null  float64
 17  17      63000 non-null  float64
 18  18      63000 non-null  float64
 19  19      63000 non-null  float64
 20  20      63000 non-null  float64
 21  21      63000 non-null  float64

In [None]:
# One scatter plot per feature against the target, using only the first 1000
# training rows to keep the figures readable.
for col, values in X.items():
  plt.scatter(values[:1000], y[:1000], alpha=0.1)
  plt.xlabel(col)
  plt.ylabel('Target')
  plt.show()
  print(col)

In [None]:
# One histogram per feature; the column label is printed under each figure.
for col, values in X.items():
  values.hist()
  plt.show()
  print(col)

# Most features seem to have a random distribution, while features `6` and `8` seem to be the only two that have a meaningful relationship with `target`

In [428]:
import numpy as np
# Flag every training row that contains at least one missing value, then
# reduce to a single boolean for the whole frame.
# The check runs on `X`: the previously referenced `X_non_random` is not
# defined anywhere in this notebook, so the cell failed on a fresh kernel.
is_any_row_value_null = X.isnull().any(axis=1)
is_any_row_value_null.any()

False

#This means that there are no null values in the dataset

# Feature `6` seems to be continuous, while `8` looks like a binary indicator (a single 0/1-style column)

#Feature Engineering:

#Since feature `6` shows a `y = x^2` relationship, we could transform it into a linear relationship by squaring `x`.

In [429]:
from sklearn.base import BaseEstimator, TransformerMixin
class SquaredFeatureAdder(BaseEstimator, TransformerMixin):
  """Transformer that appends the square of one column to its input.

  Parameters
  ----------
  feature
      Key used to index ``X`` (``X[feature]``) when transforming.
  """

  def __init__(self, feature):
    self.feature = feature

  def fit(self, X, y=None):
    """Nothing is learned from the data; returns ``self`` unchanged."""
    return self

  def transform(self, X, y=None):
    """Return ``X`` with ``X[self.feature] ** 2`` added as the last column.

    The result is a NumPy array built by stacking the original columns and
    the squared column side by side.
    """
    feature_squared = X[self.feature] ** 2
    return np.column_stack((X, feature_squared))

In [430]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

# Numeric branch: append the squared term for column '6', then standardise.
num_pipeline = Pipeline(steps=[
    ('squarer', SquaredFeatureAdder(feature='6')),
    ('scaler', StandardScaler()),
])

# Only columns '6' and '8' are routed through; '8' is passed along untouched.
full_pipeline = ColumnTransformer(transformers=[
    ('num', num_pipeline, ['6']),
    ('cat', 'passthrough', ['8']),
])

In [431]:
# Fit the preprocessing on the training split only, then apply the already
# fitted pipeline to the hold-out split. Calling fit_transform on X_test would
# re-estimate the scaler statistics from the hold-out data, so the two splits
# would be scaled differently and the pipeline state would be overwritten.
X_train_prepared = full_pipeline.fit_transform(X)
X_test_prepared = full_pipeline.transform(X_test)

In [432]:
# Sanity check of the prepared matrix dimensions (rows, columns).
X_train_prepared.shape

(63000, 3)

In [433]:
from sklearn.model_selection import cross_val_score

In [434]:
def display_scores(scores):
    """Print the given score array followed by its mean and standard deviation.

    `scores` must expose `.mean()` and `.std()` (e.g. a NumPy array).
    """
    summary = (
        ("Scores: ", scores),
        ("Mean: ", scores.mean()),
        ("Std: ", scores.std()),
    )
    for label, value in summary:
        print(label, value)

#Let's try out Lasso, Linear and RandomForest regression

In [435]:
from sklearn.linear_model import Lasso

# Baseline Lasso with default hyper-parameters, fitted on the prepared
# training data; the same instance is handed to the grid search below.
lasso_reg = Lasso()
lasso_reg.fit(X_train_prepared, y)

Lasso()

In [436]:
from sklearn.model_selection import GridSearchCV
# Candidate regularisation strengths for Lasso.
lasso_param_grid = [
    {'alpha': [0.0001, 0.0002, 0.0005, 0.001, 0.002, 0.01]},
]

# 5-fold grid search scored by negated MSE; training scores are kept as well.
grid_search = GridSearchCV(
    lasso_reg,
    lasso_param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    return_train_score=True,
)

In [437]:
# Run the search on the prepared training data and show the winning alpha.
grid_search.fit(X_train_prepared, y)
grid_search.best_params_

{'alpha': 0.0002}

In [438]:
# The search was scored with negated MSE, so flip the sign and take the square
# root to express the best cross-validated score as an RMSE.
lasso_score = np.sqrt(-grid_search.best_score_)

In [439]:
# Rebind `lasso_reg` to the estimator selected by the grid search; this is the
# model used for the final predictions further down.
lasso_reg = grid_search.best_estimator_

In [440]:
# 5-fold RMSE for the tuned Lasso configuration.
# NOTE(review): the cross-validation is run on the hold-out split, so the
# estimator is re-fitted on folds of X_test_prepared rather than evaluated as
# trained — confirm this is intended; comparing models via CV on the training
# split and scoring the chosen one once on the hold-out is the usual protocol.
lasso_neg_mse = cross_val_score(
    lasso_reg,
    X_test_prepared,
    y_test,
    scoring='neg_mean_squared_error',
    cv=5,
)
lasso_cv_scores = np.sqrt(-lasso_neg_mse)
display_scores(lasso_cv_scores)

Scores:  [0.28835506 0.28669982 0.29165345 0.28738248 0.28971015]
Mean:  0.28876019413514475
Std:  0.0017652603186202186


#Linear regression without regularization:

In [441]:
from sklearn.linear_model import LinearRegression
# Ordinary least squares, scored the same way as the Lasso model above:
# 5-fold negated MSE converted to RMSE.
lin_reg = LinearRegression()
lin_neg_mse = cross_val_score(
    lin_reg,
    X_test_prepared,
    y_test,
    scoring='neg_mean_squared_error',
    cv=5,
)
lin_cv_scores = np.sqrt(-lin_neg_mse)

In [442]:
# Per-fold RMSE, mean and standard deviation for the linear regression.
display_scores(lin_cv_scores)

Scores:  [0.28835766 0.28679067 0.29168315 0.28737188 0.28965617]
Mean:  0.2887719053032212
Std:  0.0017500997849059041


In [443]:
from sklearn.ensemble import RandomForestRegressor

# Random forest baseline with default hyper-parameters. The forest is
# stochastic (bootstrap samples, feature sub-sampling), so the seed is pinned
# to make the reported scores reproducible on a fresh run; 87 matches the seed
# used for the train/test split.
tree_reg = RandomForestRegressor(random_state=87)
tree_reg.fit(X_train_prepared, y)

RandomForestRegressor()

In [444]:
# 5-fold RMSE for the random forest, using the same data and metric as the
# linear models so the numbers are directly comparable.
tree_neg_mse = cross_val_score(
    tree_reg,
    X_test_prepared,
    y_test,
    scoring='neg_mean_squared_error',
    cv=5,
)
tree_cv_scores = np.sqrt(-tree_neg_mse)

In [445]:
# Per-fold RMSE, mean and standard deviation for the random forest.
display_scores(tree_cv_scores)

Scores:  [0.28979871 0.28822041 0.29362925 0.28921964 0.29158716]
Mean:  0.2904910354362783
Std:  0.0019131857938237073


In [446]:
# Summary statistics of the training target, as a scale reference for the
# RMSE values reported above.
y.describe()

count    63000.000000
mean        50.091696
std         28.889634
min          0.003404
25%         25.193959
50%         50.032408
75%         75.184237
max         99.997653
Name: target, dtype: float64

# It can be seen from the above that the linear models perform better than RandomForest on this dataset 

#Let's now predict for the actual test set

#The mean RMSE scores of the Lasso and Linear regressors only differ in the fifth decimal place (0.28876 vs 0.28877). Lasso scores marginally better here, and its regularization should make the model slightly less prone to overfitting (lower variance), so I'll prefer it.

In [450]:
# Load the hidden test features that predictions are produced for; the path is
# relative to the notebook's working directory.
X_final_test = pd.read_csv('internship_hidden_test.csv')

In [451]:
# Apply the pipeline that was fitted on the training split. Re-fitting here
# (fit_transform) would standardise the hidden test set with its own mean and
# variance, so the features would no longer be on the scale the model was
# trained on.
X_final_test_prepared = full_pipeline.transform(X_final_test)

In [453]:
# Predict the target for the hidden test set with the tuned Lasso model.
final_predictions = lasso_reg.predict(X_final_test_prepared)

In [457]:
# Wrap the predictions in a single-column frame and write them to disk.
# NOTE(review): no `index=` argument is passed, so with pandas' default the row
# index is written as an extra unnamed first column — confirm the expected
# submission format wants it.
final_predictions_df = pd.DataFrame(final_predictions, columns=['target'])
final_predictions_df.to_csv("internship_hidden_test_predictions.csv")