<a href="https://colab.research.google.com/github/Diyarmo/MSM_predictor/blob/master/MSM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Updating Tensorflow

In [0]:
# Install the pinned TensorFlow release used by the tf.keras model cells further down.
!pip install tensorflow==2



# Connecting To Drive

In [0]:
# Mount Google Drive: the input CSVs are read from, and the predictions are
# written to, paths under "drive/My Drive/" in later cells.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Importing Libraries

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import keras
from keras.optimizers import Adam, RMSprop
from keras import metrics
from keras import callbacks
from keras import regularizers
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

Using TensorFlow backend.


# Opening and Cleaning "data.csv" and "test.csv" files






In [0]:
def read_and_clean_file(filename):
  """Load a raw CSV and return a cleaned feature frame.

  Steps performed:
    * "Start_time" (timestamp string) becomes a fractional hour of day,
      taken from the HH and MM characters that follow the 11-character
      date prefix.
    * "Length" is divided by 4.
    * "Year" is cast to string; "Month" (string) and "Day" (int8) are
      sliced out of the "Date" string and appended as new columns.
    * The Yes/No columns are turned into 0/1 int8 flags (1 only for "Yes").
    * Columns that are not used as features are dropped.

  Args:
    filename: path or file-like object accepted by ``pd.read_csv``.

  Returns:
    The cleaned ``pd.DataFrame``.
  """
  yes_no_columns = [
      "First time or rerun",
      "# of episode in the season",
      "Movie?",
      "Game of the Canadiens during episode?",
  ]
  frame = pd.read_csv(filename)

  # Keep only the clock part of the timestamp, then express it in hours.
  clock = frame["Start_time"].str[11:]
  hours = clock.str[0:2].astype(float)
  minutes = clock.str[3:5].astype(float)
  frame["Start_time"] = hours + minutes / 60
  frame["Length"] = frame["Length"] / 4

  # Calendar parts; "Year" and "Month" stay strings.
  frame["Year"] = frame["Year"].astype(str)
  frame["Month"] = frame["Date"].str[5:7]
  frame["Day"] = frame["Date"].str[8:10].astype(np.int8)

  for flag in yes_no_columns:
    frame[flag] = (frame[flag] == "Yes").astype(np.int8)

  # "Name of show" duplicates "Episode" (per the original author's note);
  # the remaining dropped columns are either already consumed above or unused.
  unused = ["Unnamed: 0", "Name of show", "End_time", "Date", "Name of episode"]
  return frame.drop(unused, axis=1)

In [0]:
# Both CSVs live in the mounted Drive folder and go through the same cleaning.
data, test = map(
    read_and_clean_file,
    ("drive/My Drive/data.csv", "drive/My Drive/test.csv"),
)

# Filling NA values in data

In [0]:
# Per-column count of missing values before any imputation.
print(data.isna().sum())

Episode                                       0
Station                                       0
Channel Type                                  0
Season                                        0
Year                                          0
Day of week                                   0
Start_time                                   43
Length                                        0
Genre                                         0
First time or rerun                           0
# of episode in the season                    0
Movie?                                        0
Game of the Canadiens during episode?         0
Market Share_total                            0
Temperature in Montreal during episode    83344
Month                                         0
Day                                           0
dtype: int64


**Filling missing data in Start time using**

1- Mean of Start time, rounded to the nearest half hour


**Filling missing data in Temperature using**

1- Mean temperature of the rows with the same date and start time, if any of them has a value

2- Otherwise, the mean temperature of that day

In [0]:
def fill_NAs(data):
  """Impute missing "Start_time" and temperature values in place.

  * Missing "Start_time" values get the column mean rounded to the nearest
    half hour.
  * Missing temperatures are filled in two passes: first with the mean
    temperature of rows sharing the same (Year, Month, Day, Start_time),
    then - for rows still missing - with the mean of the same
    (Year, Month, Day), computed after the first pass.

  The group means are broadcast back with ``groupby().transform`` instead of
  looking up and assigning one row at a time, which gives the same values
  without a Python-level loop over every missing row.

  Args:
    data: frame with "Year", "Month", "Day", "Start_time" and
      "Temperature in Montreal during episode" columns. It is modified
      in place.

  Returns:
    The same ``data`` object. A temperature stays NaN only when no row of
    that day has a value.
  """
  temp_col = "Temperature in Montreal during episode"

  half_hour_mean = np.round(data["Start_time"].mean() * 2) / 2
  data["Start_time"] = data["Start_time"].fillna(half_hour_mean)

  # Finest grouping first; the coarser one sees the values filled by the first.
  for keys in (["Year", "Month", "Day", "Start_time"], ["Year", "Month", "Day"]):
    group_mean = data.groupby(keys)[temp_col].transform("mean")
    data[temp_col] = data[temp_col].fillna(group_mean)
  return data


In [0]:
# Impute each frame from its own values (training frame first, then test).
data, test = fill_NAs(data), fill_NAs(test)

In [0]:
# Re-check the per-column missing counts after imputation.
print(data.isna().sum(axis=0))

Episode                                   0
Station                                   0
Channel Type                              0
Season                                    0
Year                                      0
Day of week                               0
Start_time                                0
Length                                    0
Genre                                     0
First time or rerun                       0
# of episode in the season                0
Movie?                                    0
Game of the Canadiens during episode?     0
Market Share_total                        0
Temperature in Montreal during episode    0
Month                                     0
Day                                       0
dtype: int64


# Encode Episode Column

In [0]:
def get_column_encoder(data, col_name, vocab_size=1000):
  """Build a frequency-ranked integer code for the values of one column.

  The most frequent value gets code 1, the next one 2, and so on, for at
  most ``vocab_size`` values. Code 0 is reserved under the key "OTHER" for
  anything outside that vocabulary. Ties keep the sorted order of the values.

  When the column has fewer distinct values than ``vocab_size`` the encoder
  simply covers all of them (the previous implementation raised IndexError),
  and keys keep their original Python type instead of being coerced through
  a NumPy array.

  Args:
    data: frame containing ``col_name``.
    col_name: name of the column to encode.
    vocab_size: maximum number of distinct values that get their own code.

  Returns:
    dict mapping value -> int code, including ``"OTHER": 0``.
  """
  frequencies = data.groupby(col_name).size()
  # sorted() is stable, so equally frequent values stay in key order.
  ranked = sorted(frequencies.items(), key=lambda item: item[1], reverse=True)

  encoder = {"OTHER": 0}
  for code, (value, _) in enumerate(ranked[:vocab_size], start=1):
    encoder[value] = code
  return encoder

In [0]:
# Number of most frequent episodes that get their own id; the embedding layer
# defined later is sized from this value.
vocab_size = 3_000
encoder = get_column_encoder(data, col_name="Episode", vocab_size=vocab_size)

In [0]:
def encode_col(encoder, data, col_name):
  """Replace the values of ``col_name`` with their integer codes, in place.

  Values missing from ``encoder`` (including NaN) receive the "OTHER" code.
  The lookup is a single vectorized ``Series.map`` rather than a per-row
  loop, so it does not depend on the frame having a 0..n-1 index, and the
  resulting column has an integer dtype.

  Args:
    encoder: dict mapping value -> int code; must contain the key "OTHER".
    data: frame to modify.
    col_name: name of the column to encode.

  Returns:
    The same ``data`` object with ``col_name`` encoded.
  """
  codes = data[col_name].map(encoder)  # NaN where the value has no code
  data[col_name] = codes.fillna(encoder["OTHER"]).astype(int)
  return data

In [0]:
# The encoder was built from the training frame only; the test frame reuses
# it, so episodes outside the vocabulary fall back to the "OTHER" id.
data, test = (
    encode_col(encoder, data, "Episode"),
    encode_col(encoder, test, "Episode"),
)

# Make dummies for categorical features

In [0]:
# Make "Episode" an integer column so it is selected with the numeric
# features (not one-hot encoded) in the next cell.
data = data.astype({"Episode": int})
test = test.astype({"Episode": int})

In [0]:
def _dummies_plus_numeric(frame):
  """One-hot encode the object columns (first level dropped) and append the numeric columns."""
  categorical_part = pd.get_dummies(frame.select_dtypes(object), drop_first=True)
  return categorical_part.join(frame.select_dtypes(np.number))


# Each frame is encoded on its own, so the resulting dummy columns can differ
# between the two.
ready_data = _dummies_plus_numeric(data)
ready_test = _dummies_plus_numeric(test)

In [0]:
# Give the test matrix exactly the training feature columns, in the same
# order. The target is excluded by NAME rather than by position, so this keeps
# working if the column layout changes or the test file happens to contain the
# target column.
TARGET_COLUMN = "Market Share_total"
cols = [c for c in ready_data.columns if c != TARGET_COLUMN]

# Dummy columns seen in training but absent from the test frame (kept for inspection).
missed_columns_in_test = [c for c in cols if c not in ready_test.columns]

# reindex adds the missing columns filled with 0, drops columns that only
# exist in the test frame and applies the training order in a single step.
ready_test = ready_test.reindex(columns=cols, fill_value=0)

# Split data

In [0]:
# Shuffle all rows before the positional train/validation split. The fixed
# seed makes the split - and therefore every metric reported below -
# reproducible across runs.
ready_data = ready_data.sample(frac=1, random_state=42)

In [0]:
# Positional split of the shuffled frame: the first 500000 rows train the
# models, the remainder is held out for validation.
train_data = ready_data.iloc[:500000]
valid_data = ready_data.iloc[500000:]

In [0]:
# For each split: the target vector, the integer episode ids (fed to the
# embedding branch of the network), and the matrix of all remaining features.
train_y = train_data.loc[:, "Market Share_total"].values
train_x_episode = train_data.loc[:, "Episode"].values
train_x = train_data.drop(columns=["Market Share_total", "Episode"]).values

valid_y = valid_data.loc[:, "Market Share_total"].values
valid_x_episode = valid_data.loc[:, "Episode"].values
valid_x = valid_data.drop(columns=["Market Share_total", "Episode"]).values


In [0]:
# Same decomposition for the test frame, which has no target column.
test_x_episode = ready_test.loc[:, "Episode"].values
test_x = ready_test.drop(columns=["Episode"]).values


# Normalize Data

In [0]:
# Standardize features and target using statistics of the TRAINING split only,
# so nothing from the validation rows leaks into the scaling.
# Casting to float first guarantees the scaled values are not truncated if
# the feature matrix happens to be integer-typed.
train_x = np.asarray(train_x, dtype=float)
valid_x = np.asarray(valid_x, dtype=float)

feature_mean = train_x.mean(axis=0)
feature_std = train_x.std(axis=0)
# A column that is constant in the training split has zero spread; dividing
# by it would fill the column with NaN/inf. Scale such columns by 1 instead,
# which leaves them centered at 0.
feature_std = np.where(feature_std == 0, 1.0, feature_std)

train_x = (train_x - feature_mean) / feature_std
valid_x = (valid_x - feature_mean) / feature_std

target_mean = train_y.mean()
target_std = train_y.std()
valid_y = (valid_y - target_mean) / target_std
train_y = (train_y - target_mean) / target_std

# One entry per feature column followed by the target statistics as the LAST
# element; later cells index these lists per column and use [-1] for the target.
MU = list(feature_mean) + [target_mean]
Sigma = list(feature_std) + [target_std]


In [0]:
# Scale every test feature column with the statistics stored for the same
# column position during training.
for col_idx in range(test_x.shape[1]):
  test_x[:, col_idx] = (test_x[:, col_idx] - MU[col_idx]) / Sigma[col_idx]


# A Naive model: Using Mean

In [0]:
# Constant prediction used by the naive baseline: the mean training target.
m = np.mean(train_y)

In [0]:
# Validation MAE (on the normalized target) when always predicting the training mean.
mean_baseline_abs_err = np.abs(valid_y - m)
print("MAE of Using Mean is ", mean_baseline_abs_err.mean())

MAE of Using Mean is  0.5994806806208135


# A Simple model: Using Linear Regression

In [0]:
# Ordinary least squares on the non-episode features only.
linear_model = LinearRegression()
reg = linear_model.fit(X=train_x, y=train_y)

In [0]:
# Validation MAE of the linear model, on the normalized target scale.
linreg_residuals = reg.predict(valid_x) - valid_y
print("MAE of linear regression is ", np.abs(linreg_residuals).mean())


MAE of linear regression is  0.3642020818295221


# Neural Network

In [0]:
# Two-branch regression network:
#   * model1 embeds the integer episode id (0 = "OTHER", 1..vocab_size) into
#     a 16-dimensional vector;
#   * model2 passes the remaining standardized features through a dense layer.
# The branches are concatenated and fed to a small dense head with dropout
# that outputs one value (the normalized target).

# Width of the dense branch is taken from the data instead of being
# hard-coded, so the model still builds when the number of dummy columns changes.
n_features = train_x.shape[1]

model1 = tf.keras.models.Sequential([
  tf.keras.layers.Input(shape=(1,)),
  tf.keras.layers.Embedding(vocab_size+1, 16),
  tf.keras.layers.Flatten()
])

model2 = tf.keras.models.Sequential([
  tf.keras.layers.Input(shape=(n_features,)),
  tf.keras.layers.Dense(120, activation='relu')
])

mergedOut = tf.keras.layers.Concatenate()([model1.output, model2.output])
mergedOut = tf.keras.layers.Flatten()(mergedOut)
mergedOut = tf.keras.layers.Dropout(0.2)(mergedOut)
mergedOut = tf.keras.layers.Dense(120, activation='relu')(mergedOut)
mergedOut = tf.keras.layers.Dropout(0.2)(mergedOut)
mergedOut = tf.keras.layers.Dense(1)(mergedOut)

model = tf.keras.models.Model([model1.input, model2.input], mergedOut)

# The metric comes from tf.keras, like the model itself; objects from the
# standalone `keras` package should not be mixed into a tf.keras model.
model.compile(loss='mae',
        optimizer="Adam",
        metrics=[tf.keras.metrics.mae])
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 1, 16)        48016       input_1[0][0]                    
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 81)]         0                                            
__________________________________________________________________________________________________
flatten (Flatten)               (None, 16)           0           embedding[0][0]                  
______________________________________________________________________________________________

In [0]:
# Train on the training split and evaluate on the held-out validation split
# after every epoch. Inputs are passed as [episode ids, other features],
# matching the order of the model's two inputs.
epochs = 50
batch_size = 128
history = model.fit(
    x=[train_x_episode, train_x],
    y=train_y,
    validation_data=([valid_x_episode, valid_x], valid_y),
    epochs=epochs,
    batch_size=batch_size,
    shuffle=True,
    verbose=1,  # Keras verbosity level for the training log
)

Train on 500000 samples, validate on 116656 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [0]:
# Network predictions for the validation split (normalized target scale).
valid_inputs = [valid_x_episode, valid_x]
y_pred = model.predict(valid_inputs)

In [0]:
# Validation MAE of the network, on the normalized target scale.
nn_abs_errors = np.abs(y_pred.ravel() - valid_y)
print("MAE of NN is ", nn_abs_errors.mean())

MAE of NN is  0.20351906403968664


In [0]:
# Coefficient of determination of the network on the validation split.
print("R2 score of NN is ", r2_score(y_true=valid_y, y_pred=y_pred.ravel()))

R2 score of NN is  0.8782284809241947


# Predict 

In [0]:
# Network predictions for the test set (still on the normalized target scale).
test_inputs = [test_x_episode, test_x]
y_pred_test = model.predict(test_inputs)

In [0]:
# Undo the target standardization (target statistics are the last entries of
# MU / Sigma) and save the TEST-set predictions. The previous version exported
# `y_pred`, i.e. the validation predictions, under the test file name.
test_predictions = y_pred_test * Sigma[-1] + MU[-1]
pd.DataFrame(test_predictions).to_csv("drive/My Drive/pred_test.csv")