In [78]:
import torch
import torch.nn as nn

In [135]:
import pandas as pd
import numpy as np
from tqdm import tqdm
device='cuda' if torch.cuda.is_available() else 'cpu'

In [196]:
df = pd.read_csv('./Data/train.csv')

In [183]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 612702 entries, 0 to 612701
Data columns (total 4 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   Viewers_ID       612702 non-null  object 
 1   Joke_identifier  612702 non-null  object 
 2   Response_ID      612702 non-null  object 
 3   Rating           612702 non-null  float64
dtypes: float64(1), object(3)
memory usage: 18.7+ MB


In [184]:
df.describe()

Unnamed: 0,Rating
count,612702.0
mean,0.647024
std,2.667301
min,-5.0
25%,-1.3
50%,0.95
75%,2.69
max,5.0


In [185]:
print(f"Number of viewers: {df['Viewers_ID'].nunique()}\nNumber of Content:{df['Joke_identifier'].nunique()}")

Number of viewers: 40863
Number of Content:127


**Number of Ratings if all viewers watched all content:**

In [186]:
 df['Viewers_ID'].nunique()*df['Joke_identifier'].nunique()

5189601

**Number of Ratings we have:**

In [187]:
len(df)

612702

In [188]:
print(f"Percentagewise: {(612702/5189601)*100}% of the data has ratings")

Percentagewise: 11.806341181142828% of the data has ratings


**User to number of comedy Distribution**

In [189]:
df['Viewers_ID'].value_counts()

Viewers_ID
A366      54
A233      54
A367      54
A179      54
A337      54
          ..
A14786     2
A17926     2
A30007     2
A37195     2
A15489     2
Name: count, Length: 40863, dtype: int64

In [190]:
#Sample User
df[df['Viewers_ID']=='A366']

Unnamed: 0,Viewers_ID,Joke_identifier,Response_ID,Rating
451655,A366,Klint De Drunk Enugu 2,A366_Klint De Drunk Enugu 2,-3.83
451656,A366,Klint De Drunk Enugu 3,A366_Klint De Drunk Enugu 3,-4.89
451657,A366,Klint De Drunk PH 1,A366_Klint De Drunk PH 1,-2.39
451658,A366,Klint De Drunk PH 2,A366_Klint De Drunk PH 2,-4.81
451659,A366,Klint De Drunk Lagos 2,A366_Klint De Drunk Lagos 2,0.58
451660,A366,Klint De Drunk Warri 2,A366_Klint De Drunk Warri 2,2.78
451661,A366,AliBaba Lagos 1,A366_AliBaba Lagos 1,3.58
451662,A366,AliBaba Lagos 3,A366_AliBaba Lagos 3,2.59
451663,A366,AliBaba Lagos 4,A366_AliBaba Lagos 4,-4.11
451664,A366,AliBaba Lagos 5,A366_AliBaba Lagos 5,2.17


In [191]:
#Most popular Comedies
df['Joke_identifier'].value_counts()

Joke_identifier
Klint De Drunk PH 2       26311
Klint De Drunk Enugu 2    26286
Klint De Drunk PH 1       26254
Klint De Drunk Enugu 3    26234
Klint De Drunk Lagos 1    25864
                          ...  
Funny Bone Abuja 1            2
Okey Bakassi Abuja 5          2
Okey Bakassi Abuja 1          1
Okey Bakassi Abuja 2          1
Okey Bakassi Lagos 3          1
Name: count, Length: 127, dtype: int64

In [73]:
# Average rating for each user]\
avg_ratings_user={}
avg_ratings_comedy={}

def find_avg_rating_user(user):
    ratings=df[df['Viewers_ID']==user]['Rating'].values
    avg_rating=np.sum(ratings)/len(ratings)
    avg_ratings_user[user]=avg_rating
    
def find_avg_rating_comedy(comedy):
    ratings=df[df['Joke_identifier']==comedy]['Rating'].values
    avg_rating=np.sum(ratings)/len(ratings)
    avg_ratings_comedy[comedy]=avg_rating
    
for index,row in df.iterrows():
    user,comedy=row['Viewers_ID'],row['Joke_identifier']
    if user not in avg_ratings_user.keys():
        find_avg_rating_user(user)
    if comedy not in avg_ratings_comedy.keys():
        find_avg_rating_comedy(comedy)

In [192]:
users_to_i={k:v for k,v in zip(df['Viewers_ID'].unique(),range(len(df['Viewers_ID'].unique())))}
comedy_to_i={k:v for k,v in zip(df['Joke_identifier'].unique(),range(len(df['Joke_identifier'].unique())))}

i_to_users={users_to_i[k]:k for k in users_to_i.keys()}
i_to_comedy={comedy_to_i[k]:k for k in comedy_to_i.keys()}

In [193]:
df['Response_ID']=[x for x in range(len(df))]

In [194]:
df

Unnamed: 0,Viewers_ID,Joke_identifier,Response_ID,Rating
0,A1,Klint De Drunk Enugu 1,0,0.11
1,A1,Klint De Drunk Enugu 2,1,-4.64
2,A1,Klint De Drunk PH 1,2,-3.39
3,A1,Klint De Drunk PH 2,3,0.44
4,A1,Klint De Drunk Lagos 1,4,-4.83
...,...,...,...,...
612697,A9999,Gordons Lagos 2,612697,-0.92
612698,A9999,Gordons Lagos 3,612698,2.44
612699,A9999,Gordons Lagos 4,612699,1.03
612700,A9999,Gordons Abuja 1,612700,1.22


In [197]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0, 1))

df['scaled_Rating'] = scaler.fit_transform(df[['Rating']])
df

Unnamed: 0,Viewers_ID,Joke_identifier,Response_ID,Rating,scaled_Rating
0,A1,Klint De Drunk Enugu 1,A1_Klint De Drunk Enugu 1,0.11,0.511
1,A1,Klint De Drunk Enugu 2,A1_Klint De Drunk Enugu 2,-4.64,0.036
2,A1,Klint De Drunk PH 1,A1_Klint De Drunk PH 1,-3.39,0.161
3,A1,Klint De Drunk PH 2,A1_Klint De Drunk PH 2,0.44,0.544
4,A1,Klint De Drunk Lagos 1,A1_Klint De Drunk Lagos 1,-4.83,0.017
...,...,...,...,...,...
612697,A9999,Gordons Lagos 2,A9999_Gordons Lagos 2,-0.92,0.408
612698,A9999,Gordons Lagos 3,A9999_Gordons Lagos 3,2.44,0.744
612699,A9999,Gordons Lagos 4,A9999_Gordons Lagos 4,1.03,0.603
612700,A9999,Gordons Abuja 1,A9999_Gordons Abuja 1,1.22,0.622


In [198]:
df.describe()

Unnamed: 0,Rating,scaled_Rating
count,612702.0,612702.0
mean,0.647024,0.564702
std,2.667301,0.26673
min,-5.0,0.0
25%,-1.3,0.37
50%,0.95,0.595
75%,2.69,0.769
max,5.0,1.0


In [200]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self,df):
        super(Dataset,self).__init__()
        self.df=df
    def __len__(self):
        return len(self.df)
    def __getitem__(self,idx):
        data=self.df.iloc[idx]
        user=users_to_i[data['Viewers_ID']]
        comedy=comedy_to_i[data['Joke_identifier']]
        rating=data['scaled_Rating']
        return torch.Tensor([user,comedy]),torch.Tensor([rating])

train_idx=int(len(df)*0.8)
val_idx=int(len(df)*0.1)

train_dataset=Dataset(df[:train_idx])
val_dataset=Dataset(df[train_idx:train_idx+val_idx])
test_dataset=Dataset(df[train_idx+val_idx:])

train_loader= torch.utils.data.DataLoader(train_dataset,batch_size=500,shuffle=True)
val_loader= torch.utils.data.DataLoader(val_dataset,batch_size=500,shuffle=True)
test_loader= torch.utils.data.DataLoader(test_dataset,batch_size=500,shuffle=True)

In [96]:
train_dataset.__len__(),val_dataset.__len__(),test_dataset.__len__()

(490161, 61270, 61271)

In [201]:
train_dataset.__getitem__(1001)

(tensor([61.,  2.]), tensor([0.7030]))

In [202]:
class Recommender(nn.Module):
    def __init__(self,no_of_users,no_of_comedy,embed_dim):
        super(Recommender, self).__init__()

        self.user_embeddings= nn.Embedding(no_of_users,embed_dim)
        self.comedy_embeddings= nn.Embedding(no_of_items,embed_dim)

        self.fc1=nn.Linear(embed_dim*2, 200)
        self.fc2=nn.Linear(200, 100)
        self.fc3=nn.Linear(100, 1)
        self.relu=nn.ReLU()
        self.dropout=nn.Dropout(p=0.3)

    def forward(self,x):
        x=torch.cat([self.user_embeddings(x[:,0].long()),self.comedy_embeddings(x[:,1].long())],dim=1)
        out=self.relu(self.fc1(x))
        out=self.dropout(out)
        out=self.relu(self.fc2(out))
        out=self.dropout(out)
        out=self.relu(self.fc3(out))
        return out

no_of_users,no_of_comedy=len(users_to_i),len(comedy_to_i)
embed_dim=20

model=Recommender(no_of_users,no_of_comedy,embed_dim)
model=model.to(device)

In [203]:
model

Recommender(
  (user_embeddings): Embedding(40863, 20)
  (comedy_embeddings): Embedding(127, 20)
  (fc1): Linear(in_features=40, out_features=200, bias=True)
  (fc2): Linear(in_features=200, out_features=100, bias=True)
  (fc3): Linear(in_features=100, out_features=1, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.3, inplace=False)
)

In [204]:
criterion = nn.MSELoss()
def train(epochs,model,dir):
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    model_path='model.pt'
    best_val_loss = float('inf')
    best_train_loss = float('inf')

    for epoch in range(epochs):
        total_loss = 0
        total_val_loss = 0
        print('Training..')
        model.train()
        for batch in tqdm(train_loader):
            inputs,labels = batch
            inputs,labels=inputs.to(device),labels.to(device)
            out = model.forward(inputs)
            optimizer.zero_grad()
            loss = criterion(out,labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        total_loss /= len(train_loader)
        print('Validating..')
        model.eval()
        for val_batch in tqdm(val_loader):
            with torch.no_grad():
                val_inputs,val_labels = val_batch
                val_inputs,val_labels=val_inputs.to(device),val_labels.to(device)
                val_out = model.forward(val_inputs)
                val_loss = criterion(val_out, val_labels)
                total_val_loss += val_loss.item()

        total_val_loss /= len(val_loader)

        if total_val_loss < best_val_loss:
            print('re-assigning best loss')
            best_val_loss = total_val_loss
            best_train_loss=total_loss
            print('storing best model')
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': loss
            }, f"{dir}/{model_path}")
            
            #for Inference
            model_scripted = torch.jit.script(model)
            model_scripted.save(f"{dir}/model_scripted.pt")

        print(f"Epoch: {epoch + 1} | Train Loss: {total_loss} | Validation Loss: {total_val_loss}")

    return {
        'Best_training_loss':best_train_loss,
        'Best_validation_loss': best_val_loss
        }

In [205]:
train(5,model,'./')

Training..


100%|██████████| 981/981 [01:22<00:00, 11.85it/s]


Validating..


100%|██████████| 123/123 [00:12<00:00,  9.92it/s]


re-assigning best loss
storing best model
Epoch: 1 | Train Loss: 0.06810411677204992 | Validation Loss: 0.06674015349367769
Training..


100%|██████████| 981/981 [01:49<00:00,  9.00it/s]


Validating..


100%|██████████| 123/123 [00:12<00:00, 10.09it/s]


Epoch: 2 | Train Loss: 0.062331985150607234 | Validation Loss: 0.06724329047450205
Training..


100%|██████████| 981/981 [01:48<00:00,  9.08it/s]


Validating..


100%|██████████| 123/123 [00:11<00:00, 10.40it/s]


Epoch: 3 | Train Loss: 0.05808032561894829 | Validation Loss: 0.07036251274914276
Training..


100%|██████████| 981/981 [01:53<00:00,  8.63it/s]


Validating..


100%|██████████| 123/123 [00:13<00:00,  9.10it/s]


Epoch: 4 | Train Loss: 0.05250259587934258 | Validation Loss: 0.07213776173993824
Training..


100%|██████████| 981/981 [01:58<00:00,  8.27it/s]


Validating..


100%|██████████| 123/123 [00:14<00:00,  8.63it/s]


Epoch: 5 | Train Loss: 0.04876303896080099 | Validation Loss: 0.07392782911779434


{'Best_training_loss': 0.06810411677204992,
 'Best_validation_loss': 0.06674015349367769}

In [206]:
def load_model(dir,mode='inference'):
    if mode=='training':
        model=Recommender(no_of_users,no_of_comedy,embed_dim)
        checkpoint = torch.load('./model.pt')
        model.load_state_dict(checkpoint['model_state_dict'])
    else:
        model = torch.jit.load('model_scripted.pt')
    model.eval()
    return model

In [494]:
model=load_model(dir='./',mode='inference')

total_test_loss=0
for test_batch in tqdm(test_loader):
            with torch.no_grad():
                test_inputs,test_labels = test_batch
                test_inputs,test_labels=test_inputs.to(device),test_labels.to(device)
                test_out = model.forward(test_inputs)
                test_loss = criterion(test_out, test_labels)
                total_test_loss += test_loss.item()

total_test_loss /= len(val_loader)
print(f"Test Loss: {total_test_loss}")

100%|██████████| 123/123 [00:08<00:00, 14.07it/s]


Test Loss: 0.06913939854357301


In [None]:
Test Model

In [292]:
df.iloc[0],train_dataset.__getitem__(0)

(Viewers_ID                                A1
 Joke_identifier       Klint De Drunk Enugu 1
 Response_ID        A1_Klint De Drunk Enugu 1
 Rating                                  0.11
 scaled_Rating                          0.511
 Name: 0, dtype: object,
 (tensor([0., 0.]), tensor([0.5110])))

In [539]:
def predict_rating(user,comedy):
    user=users_to_i[user]
    comedy=comedy_to_i[comedy]
    input=torch.Tensor([user,comedy]).view(1,-1)
    output=(model(input)).detach().numpy()
    predicted_rating=scaler.inverse_transform(output)[0][0]
    return predicted_rating

In [508]:
user_idx=np.random.randint(len(users_to_i))
comedy_idx=np.random.randint(len(comedy_to_i))

user_id=df.iloc[user_idx]['Viewers_ID']
comedy_id=df.iloc[comedy_idx]['Joke_identifier']
print(f"User: {user_id}\nComedy: {comedy_id}")

available_rating=df[(df['Viewers_ID']==user_id) & (df['Joke_identifier']==comedy_id)]
if len(available_rating)>0:
    print(f"Rating Given: {available_rating['Rating'].values[0]}\n")
else:
    print(f"No rating Given as User Has not seen this comedy Yet!!\n")

predict_rating(user,comedy)

User: A11593
Comedy: Bovi Warri 3
No rating Given as User Has not seen this comedy Yet!!



-0.3141892

In [541]:
test_df=pd.read_csv('./Data/SampleSubmission.csv')

In [543]:
predictions=[]

def create_pred(text):
    try:
        response=text.split('_')
        user=response[0]
        comedy=response[1]
        rating=predict_rating(user,comedy)
    except:
        rating=0
    return rating
    
test_df['Rating']=test_df['Response_ID'].apply(create_pred)
test_df

Unnamed: 0.1,Unnamed: 0,Response_ID,Rating
0,0,A1_Akpororo Lagos 1,1.935296
1,1,A1_Akpororo Abuja 1,1.537716
2,2,A1_Akpororo Abuja 5,1.797838
3,3,A1_I Go Dye Benin 1,1.721010
4,4,A1_I Go Dye Benin 3,1.053936
...,...,...,...
435868,435868,A9999_Funny Bone Lagos 1,0.387867
435869,435869,A9999_Okey Bakassi Lagos 2,0.063205
435870,435870,A9999_Okey Bakassi Abuja 3,0.000000
435871,435871,A9999_MisterIbu Lagos 1,0.000000


In [544]:
len(test_df[test_df['Rating']==0])

73922

In [545]:
test_df.to_csv('./Data/SampleSubmission.csv')