In [186]:
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler    
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,r2_score

In [3]:
np.set_printoptions(suppress=True)
pd.set_option('display.max_columns',8000)
pd.set_option('display.max_rows',7000)

In [4]:
from google.colab import files
files.upload()

Saving test.csv to test.csv
Saving train.csv to train.csv


In [260]:
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')

In [6]:
train.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [22]:
(train.isnull().sum()/train.shape[0])*100

Product_ID                    0.0
Gender                        0.0
Age                           0.0
Occupation                    0.0
City_Category                 0.0
Stay_In_Current_City_Years    0.0
Marital_Status                0.0
Product_Category_1            0.0
Purchase                      0.0
dtype: float64

In [21]:
(test.isnull().sum()/test.shape[0])*100

Product_ID                    0.0
Gender                        0.0
Age                           0.0
Occupation                    0.0
City_Category                 0.0
Stay_In_Current_City_Years    0.0
Marital_Status                0.0
Product_Category_1            0.0
dtype: float64

In [12]:
Test_userid=test['User_ID'].values
y=train[['Purchase']].values

In [27]:
train.nunique()

Product_ID                    3631
Gender                           2
Age                              7
Occupation                      21
City_Category                    3
Stay_In_Current_City_Years       5
Marital_Status                   2
Product_Category_1              20
dtype: int64

In [26]:
test.nunique()

Product_ID                    3491
Gender                           2
Age                              7
Occupation                      21
City_Category                    3
Stay_In_Current_City_Years       5
Marital_Status                   2
Product_Category_1              18
dtype: int64

In [17]:
print(train.shape)
print(test.shape)

(550068, 12)
(233599, 11)


In [18]:
train=train.drop(labels=['User_ID','Product_Category_2','Product_Category_3'], axis=1)
test=test.drop(labels=['User_ID','Product_Category_2','Product_Category_3'],axis=1)

In [23]:
train=train.drop(labels=['Purchase'], axis=1)

In [100]:
df=train.append(test)

In [101]:
df.nunique()

Product_ID                    3677
Gender                           2
Age                              7
Occupation                      21
City_Category                    3
Stay_In_Current_City_Years       5
Marital_Status                   2
Product_Category_1              20
dtype: int64

In [102]:
df.shape

(783667, 8)

In [106]:
df.isnull().sum()

Gender                        0
Age                           0
Occupation                    0
City_Category                 0
Stay_In_Current_City_Years    0
Marital_Status                0
Product_Category_1            0
dtype: int64

In [104]:
df.head()

Unnamed: 0,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1
0,P00069042,F,0-17,10,A,2,0,3
1,P00248942,F,0-17,10,A,2,0,1
2,P00087842,F,0-17,10,A,2,0,12
3,P00085442,F,0-17,10,A,2,0,12
4,P00285442,M,55+,16,C,4+,0,8


In [92]:
df.dtypes

Gender                        category
Age                           category
Occupation                    category
City_Category                 category
Stay_In_Current_City_Years    category
Marital_Status                category
Product_Category_1            category
dtype: object

In [105]:
#df=df.drop(labels=['prd_maritial'], axis=1)
df=df.drop(labels=['Product_ID'], axis=1)

In [107]:
categorical_cols=['Gender','Age','Stay_In_Current_City_Years','City_Category','Marital_Status','Product_Category_1','Occupation','City_Category']

In [108]:
ord_cols=['Gender','Age','City_Category','Stay_In_Current_City_Years']

In [109]:
from sklearn.preprocessing import OrdinalEncoder
enc=OrdinalEncoder()
df[ord_cols]=enc.fit_transform(df[ord_cols])

In [110]:
for col in df.columns:
  df[col]=df[col].astype('category')

In [111]:
df_frame=np.stack([df[col].cat.codes.values for col in df.columns],1)
df_frame=torch.tensor(df_frame, dtype=torch.int64)
df_frame[:10]

tensor([[ 0,  0, 10,  0,  2,  0,  2],
        [ 0,  0, 10,  0,  2,  0,  0],
        [ 0,  0, 10,  0,  2,  0, 11],
        [ 0,  0, 10,  0,  2,  0, 11],
        [ 1,  6, 16,  2,  4,  0,  7],
        [ 1,  2, 15,  0,  3,  0,  0],
        [ 1,  4,  7,  1,  2,  1,  0],
        [ 1,  4,  7,  1,  2,  1,  0],
        [ 1,  4,  7,  1,  2,  1,  0],
        [ 1,  2, 20,  0,  1,  1,  7]])

In [112]:
output=torch.tensor(y).flatten()
output[:5]

tensor([ 8370, 15200,  1422,  1057,  7969])

In [113]:
print(df.shape)
print(output.shape)

(783667, 7)
torch.Size([550068])


In [114]:
categorical_col_sizes=[len(df[col].cat.categories) for col in categorical_cols]
categorical_emb_sizes=[(col_size, min(50, (col_size+1)//2)) for col_size in categorical_col_sizes]
print(categorical_emb_sizes)

[(2, 1), (7, 4), (5, 3), (3, 2), (2, 1), (20, 10), (21, 11), (3, 2)]


In [243]:
train=df[:550068]
test=df[550068:]

In [123]:
X=train.values
y=np.array(output)
print(X[:5])
print(y[:5])

[[ 0.  0. 10.  0.  2.  0.  3.]
 [ 0.  0. 10.  0.  2.  0.  1.]
 [ 0.  0. 10.  0.  2.  0. 12.]
 [ 0.  0. 10.  0.  2.  0. 12.]
 [ 1.  6. 16.  2.  4.  0.  8.]]
[ 8370 15200  1422  1057  7969]


In [232]:
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.2, random_state=69)
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.1, random_state=21)

In [233]:


X_train, y_train = np.array(X_train), np.array(y_train)
X_val, y_val = np.array(X_val), np.array(y_val)
X_test, y_test = np.array(X_test), np.array(y_test)

In [234]:
y_train, y_test, y_val = y_train.astype(float), y_test.astype(float), y_val.astype(float)

In [235]:
class RegressionDataset(Dataset):
    
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
        
    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
        
    def __len__ (self):
        return len(self.X_data)

In [236]:
train_dataset = RegressionDataset(torch.from_numpy(X_train).float(), torch.from_numpy(y_train).float())
val_dataset = RegressionDataset(torch.from_numpy(X_val).float(), torch.from_numpy(y_val).float())
test_dataset = RegressionDataset(torch.from_numpy(X_test).float(), torch.from_numpy(y_test).float())

In [237]:
EPOCHS = 100
BATCH_SIZE = 64
LEARNING_RATE = 0.01
NUM_FEATURES = 7

In [245]:
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=1)
test_loader = DataLoader(dataset=test_dataset, batch_size=1)

In [246]:
class MultipleRegression(nn.Module):
    def __init__(self, num_features):
        super(MultipleRegression, self).__init__()
        
        self.layer_1 = nn.Linear(num_features, 16)
        self.layer_2 = nn.Linear(16, 32)
        self.layer_3 = nn.Linear(32, 16)
        self.layer_out = nn.Linear(16, 1)
        
        self.relu = nn.ReLU()
    def forward(self, inputs):
        x = self.relu(self.layer_1(inputs))
        x = self.relu(self.layer_2(x))
        x = self.relu(self.layer_3(x))
        x = self.layer_out(x)
        return (x) 

    def predict(self, test_inputs):
        x = self.relu(self.layer_1(test_inputs))
        x = self.relu(self.layer_2(x))
        x = self.relu(self.layer_3(x))
        x = self.layer_out(x)
        return (x)   

In [151]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [247]:
model = MultipleRegression(NUM_FEATURES)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [248]:
from tqdm.notebook import tqdm

In [249]:
loss_stats = {'train': [],"val": []}

In [250]:
print("Begin training.")
for e in tqdm(range(1, EPOCHS+1)):
  train_epoch_loss = 0

  model.train()
  for X_train_batch, y_train_batch in train_loader:
     optimizer.zero_grad()
     y_train_pred = model(X_train_batch)
     train_loss = criterion(y_train_pred, y_train_batch.unsqueeze(1))
     optimizer.zero_grad()
     train_loss.backward()
     optimizer.step()
     train_epoch_loss += train_loss.item()
   
  with torch.no_grad():
      val_epoch_loss = 0
      model.eval()
      for X_val_batch, y_val_batch in val_loader:
        y_val_pred = model(X_val_batch)
        val_loss = criterion(y_val_pred, y_val_batch.unsqueeze(1))
        val_epoch_loss += val_loss.item()
  loss_stats['train'].append(train_epoch_loss/len(train_loader))
  loss_stats['val'].append(val_epoch_loss/len(val_loader))
  print("Train Loss ",(train_epoch_loss/len(train_loader)))
  print("Val Loss ",(val_epoch_loss/len(val_loader)))

Begin training.


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

Train Loss  19109667.60575214
Val Loss  18097708.47635017
Train Loss  17941076.62352561
Val Loss  19075679.797356803
Train Loss  17586505.43658103
Val Loss  17084231.58073416
Train Loss  15714064.80933915
Val Loss  14421331.765762378
Train Loss  14294226.49781871
Val Loss  14183968.359853975
Train Loss  13932218.712069802
Val Loss  13900508.770946573
Train Loss  13795224.880675392
Val Loss  13708279.773945682
Train Loss  13662202.200840201
Val Loss  13477484.54135669
Train Loss  13503071.961787041
Val Loss  13989945.540839596
Train Loss  12483525.256584262
Val Loss  12646307.17070401
Train Loss  11592380.803280013
Val Loss  12440166.782577602
Train Loss  11702476.613265472
Val Loss  11297676.561855696
Train Loss  11306257.156083373
Val Loss  11050572.089366635
Train Loss  11221089.862538375
Val Loss  11082730.190541903
Train Loss  11134210.963887542
Val Loss  10897770.835958103
Train Loss  11133160.648327678
Val Loss  11799874.718415737
Train Loss  11204474.057925351
Val Loss  11414340

In [251]:
y_pred_list = []
with torch.no_grad():
    model.eval()
    for X_batch, _ in test_loader:
        y_test_pred = model(X_batch)
        y_pred_list.append(y_test_pred.cpu().numpy())


In [252]:
y_pred_list = [a.squeeze().tolist() for a in y_pred_list]

In [253]:
mse = mean_squared_error(y_test, y_pred_list)
r_square = r2_score(y_test, y_pred_list)

print("Mean Squared Error :",np.sqrt(mse))
print("R^2 :",r_square)

Mean Squared Error : 3018.432935531071
R^2 : 0.6402803469723384


In [254]:
test=np.array(test)

In [255]:
test=torch.from_numpy(test).float()
test

tensor([[ 1.,  4.,  7.,  ...,  2.,  1.,  1.],
        [ 1.,  2., 17.,  ...,  0.,  0.,  3.],
        [ 0.,  3.,  1.,  ...,  4.,  1.,  5.],
        ...,
        [ 0.,  2., 15.,  ...,  4.,  1.,  1.],
        [ 0.,  4.,  1.,  ...,  4.,  0., 10.],
        [ 0.,  4.,  0.,  ...,  4.,  1.,  4.]])

In [256]:
pred_list = []
with torch.no_grad():
    model.eval()
    test_pred = model(test)
    pred_list.append(test_pred.cpu().numpy())

In [257]:
pred_list=[a.squeeze().tolist() for a in pred_list]

In [261]:
subdf=pd.DataFrame(test,columns=['User_ID','Product_ID'])
subdf['Purchase']=np.reshape(pred_list,(-1,1))

In [262]:
subdf.head()

Unnamed: 0,User_ID,Product_ID,Purchase
0,1000004,P00128942,13986.198242
1,1000009,P00113442,10200.819336
2,1000010,P00288442,6457.666504
3,1000010,P00145342,2499.122314
4,1000011,P00053842,2680.040771


In [263]:
subdf.to_csv('subdf.csv', index=False)