In [1]:
import pandas as pd
import csv
import numpy as np

In [2]:
column_types = {
    'isAdult': float,
    'startYear': float,
    'endYear': float,
    'runtimeMinutes': float,
    'tconst': str,
    'titleType': str,
    'primaryTitle': str,
    'originalTitle': str,
    'genres': str
}

titles_df = pd.read_csv("data.tsv", 
                        dtype=column_types,
                        na_values=r'\N',
                        sep="\t",
                        quoting=csv.QUOTE_NONE)

titles_df = titles_df.dropna(subset=['isAdult','runtimeMinutes','startYear'])

In [3]:
print(titles_df.shape)
titles_df.head()

(2949978, 9)


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0.0,1894.0,,1.0,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0.0,1892.0,,5.0,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0.0,1892.0,,4.0,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0.0,1892.0,,12.0,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0.0,1893.0,,1.0,"Comedy,Short"


In [4]:
titles_df['titleType'].unique()

array(['short', 'movie', 'tvShort', 'tvMovie', 'tvSeries', 'tvEpisode',
       'tvMiniSeries', 'video', 'tvSpecial', 'videoGame'], dtype=object)

In [5]:
tv_types = [
    'tvMovie',
    'tvSeries',
    'tvEpisode',
    'tvShort',
    'tvMiniSeries',
    'tvSpecial'
]
titles_df = titles_df.loc[titles_df['titleType'].isin(tv_types)]

In [6]:
titles_df = titles_df.loc[titles_df['primaryTitle'] == titles_df['originalTitle']]
titles_df

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
28755,tt0029270,tvShort,Much Ado About Nothing,Much Ado About Nothing,0.0,1937.0,,10.0,"Comedy,Romance,Short"
29765,tt0030298,tvMovie,Julius Caesar,Julius Caesar,0.0,1938.0,,101.0,"Drama,History"
34971,tt0035599,tvSeries,Voice of Firestone Televues,Voice of Firestone Televues,0.0,1943.0,1947.0,15.0,
37600,tt0038276,tvSeries,You Are an Artist,You Are an Artist,0.0,1946.0,1955.0,15.0,Talk-Show
38056,tt0038738,tvMovie,A Midsummer Night's Dream,A Midsummer Night's Dream,0.0,1946.0,,150.0,"Drama,Fantasy"
...,...,...,...,...,...,...,...,...,...
10408643,tt9916690,tvEpisode,Horrid Henry Delivers the Milk,Horrid Henry Delivers the Milk,0.0,2012.0,,10.0,"Adventure,Animation,Comedy"
10408644,tt9916692,tvMovie,Teatroteka: Czlowiek bez twarzy,Teatroteka: Czlowiek bez twarzy,0.0,2015.0,,66.0,Drama
10408677,tt9916766,tvEpisode,Episode #10.15,Episode #10.15,0.0,2019.0,,43.0,"Family,Game-Show,Reality-TV"
10408712,tt9916840,tvEpisode,Horrid Henry's Comic Caper,Horrid Henry's Comic Caper,0.0,2014.0,,11.0,"Adventure,Animation,Comedy"


In [7]:
def principal_component_analysis(dataframe, key_list):
    new_dataframe = dataframe[key_list]

    # Compute the mean and std of the data
    mean = np.mean(new_dataframe, axis=0)
    std = np.std(new_dataframe, axis=0)
    # Standardize the data by subtracting the mean and dividing by std
    standardized_data = (new_dataframe - mean) / std
    
    # Compute the covariance matrix
    cov_matrix = np.cov(standardized_data, rowvar=False)

    # Get the eigenvalues and eigenvectors
    eigen_values, eigen_vectors = np.linalg.eig(cov_matrix)

    # Sort the eigenvectors by decreasing eigenvalues
    sorted_index = np.argsort(eigen_values)[::-1]
    sorted_eigenvalue = eigen_values[sorted_index]
    sorted_eigenvectors = eigen_vectors[:, sorted_index]

    # Transform the data
    return np.dot(standardized_data, sorted_eigenvectors)

In [8]:
foo = principal_component_analysis(titles_df,["isAdult",'runtimeMinutes', "startYear"])
foo

array([[-2.24930501, -1.92286087, -2.18237185],
       [-2.85453809,  0.03628494, -2.51660433],
       [-2.07057413, -1.70349914, -1.98260552],
       ...,
       [ 0.44219416,  0.32168503,  0.69178961],
       [ 0.48927506, -0.45459475,  0.63876866],
       [ 0.49631782, -0.47591732,  0.64284475]])

In [9]:
"""
Least Squares
"""
def least_squares_method(dataframe, key1, key2):
    col1 , col2 = dataframe[key1], dataframe[key2]
    mean_x, mean_y = np.mean(col1), np.mean(col2)
    numer, den = 0, 0
    
    for i in range(len(dataframe[key1])):
        numer += (col1.iloc[i] - mean_x) * (col2.iloc[i] - mean_y)
        den += (col1.iloc[i] - mean_x) ** 2

    if den == 0:
        raise ValueError("Denominator is 0.")
        
    m = numer / den
    c = mean_y - (m * mean_x)

    return m,c
 

In [10]:
zoo = least_squares_method(titles_df, "startYear", "runtimeMinutes")
zoo

(-0.03286783674915055, 107.06194578859679)

In [11]:
titles_df.columns

Index(['tconst', 'titleType', 'primaryTitle', 'originalTitle', 'isAdult',
       'startYear', 'endYear', 'runtimeMinutes', 'genres'],
      dtype='object')

In [12]:
def least_squares_classification(dataframe, key1, key2, num_classes):
    col1, col2 = dataframe[key1], dataframe[key2]

    # Initialize arrays for coefficients and intercepts
    coefficients = np.zeros((num_classes, 1))  # Each class has a single coefficient
    intercepts = np.zeros(num_classes)

    for class_label in range(num_classes):
        class_indicator = (dataframe['titleType'] == class_label).astype(int)

        # Compute coefficients and intercept for the current class
        mean_x, mean_y = np.mean(col1), np.mean(col2)
        numer, den = 0, 0

        for i in range(len(dataframe[key1])):
            numer += (col1.iloc[i] - mean_x) * (class_indicator.iloc[i] - mean_y)
            den += (col1.iloc[i] - mean_x) ** 2

        if den == 0:
            raise ValueError("Denominator is 0.")

        coefficients[class_label, 0] = numer / den
        intercepts[class_label] = mean_y - (coefficients[class_label, 0] * mean_x)
        print(coefficients, 'and', intercepts)

    # Apply least squares lines to the data for each class
    predictions = np.dot(coefficients, col1.values.reshape(1, -1)) + intercepts.reshape(-1, 1)

    # Determine the predicted class for each data point
    predicted_labels = np.argmax(predictions, axis=0)

    return predicted_labels



In [13]:
num_classes = titles_df['titleType'].nunique()
num_classes

6

In [14]:
predicted_labels = least_squares_classification(titles_df, "runtimeMinutes", "startYear", num_classes)

[[2.13819288e-12]
 [0.00000000e+00]
 [0.00000000e+00]
 [0.00000000e+00]
 [0.00000000e+00]
 [0.00000000e+00]] and [2003.11377076    0.            0.            0.            0.
    0.        ]
[[2.13819288e-12]
 [2.13819288e-12]
 [0.00000000e+00]
 [0.00000000e+00]
 [0.00000000e+00]
 [0.00000000e+00]] and [2003.11377076 2003.11377076    0.            0.            0.
    0.        ]
[[2.13819288e-12]
 [2.13819288e-12]
 [2.13819288e-12]
 [0.00000000e+00]
 [0.00000000e+00]
 [0.00000000e+00]] and [2003.11377076 2003.11377076 2003.11377076    0.            0.
    0.        ]
[[2.13819288e-12]
 [2.13819288e-12]
 [2.13819288e-12]
 [2.13819288e-12]
 [0.00000000e+00]
 [0.00000000e+00]] and [2003.11377076 2003.11377076 2003.11377076 2003.11377076    0.
    0.        ]
[[2.13819288e-12]
 [2.13819288e-12]
 [2.13819288e-12]
 [2.13819288e-12]
 [2.13819288e-12]
 [0.00000000e+00]] and [2003.11377076 2003.11377076 2003.11377076 2003.11377076 2003.11377076
    0.        ]
[[2.13819288e-12]
 [2.13819288e-

In [15]:
import numpy as np

def least_squares_classification(dataframe, key1, key2, class_key):
    col1, col2, class_labels = dataframe[key1], dataframe[key2], dataframe[class_key]
    
    # Get unique class labels
    classes = np.unique(class_labels)
    
    # Initialize matrices for the least squares problem
    A = np.ones((len(dataframe), 1))  # Bias term
    A = np.hstack((A, col1.values.reshape(-1, 1), col2.values.reshape(-1, 1)))  # Append input features
    B = np.zeros((len(dataframe), len(classes)))

    for i, c in enumerate(classes):
        # Set 1 for rows where the class is c, 0 otherwise
        B[:, i] = (class_labels == c).astype(int)

    # Solve the least squares problem
    coefficients, residuals, _, _ = np.linalg.lstsq(A, B, rcond=None)

    return coefficients[1:], coefficients[0]


In [16]:

m, c = least_squares_classification(titles_df, 'runtimeMinutes', 'isAdult', 'titleType')
print("Coefficients (m):", m)
print("Intercept (c):", c)


Coefficients (m): [[-1.46282867e-03  2.40231038e-04  8.43807262e-04  1.59575135e-04
  -7.60219684e-05  2.95237199e-04]
 [ 1.12478726e-01 -5.58509369e-03 -4.45384996e-02 -4.74524213e-02
  -5.75876004e-03 -9.14395144e-03]]
Intercept (c): [9.23574768e-01 9.56906229e-05 2.04587683e-02 4.72838342e-02
 8.15137147e-03 4.35567770e-04]


In [17]:


# Add the predicted labels to the dataframe
titles_df['predicted_labels'] = predicted_labels

# Display the results
print("Predicted labels:")
print(titles_df[['titleType', 'predicted_labels']])

Predicted labels:
          titleType  predicted_labels
28755       tvShort                 0
29765       tvMovie                 0
34971      tvSeries                 0
37600      tvSeries                 0
38056       tvMovie                 0
...             ...               ...
10408643  tvEpisode                 0
10408644    tvMovie                 0
10408677  tvEpisode                 0
10408712  tvEpisode                 0
10408720  tvEpisode                 0

[1714219 rows x 2 columns]


In [18]:
num_zeros = np.count_nonzero(predicted_labels == 0.0)
print("Number of 0s in Predicted Labels:", num_zeros)

Number of 0s in Predicted Labels: 1714219


In [19]:
"""
Compare with pre-built implementation
"""
start_year = titles_df["startYear"]
minutes = titles_df["runtimeMinutes"]

A = np.vstack([start_year, np.ones(len(start_year))]).T


zoo2 = np.linalg.lstsq( A, minutes, rcond=None)
zoo2[0]

array([-3.28678367e-02,  1.07061946e+02])

In [20]:
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn

#print(titles_df[["runtimeMinutes", "isAdult", "startYear", "genres"]])

class TrainDataset(Dataset):
    def __init__(self, data):
        self.data = data
    def __len__(self):
        return self.data.shape[0]
    def __getitem__(self, ind):  
        x = self.data.iloc[ind][["runtimeMinutes", "startYear"]]
        y = self.data.iloc[ind][["isAdult"]]
        # x = np.vstack(x)
        # x = np.vstack(y)
        return x.to_numpy(dtype='double'), y.to_numpy(dtype='longlong').item()


class TestDataset(TrainDataset):
    def __getitem__(self, ind):
        x = self.data.iloc[ind][["runtimeMinutes", "startYear"]]
        return x.to_numpy(dtype='double')


titles_df.index = range(0,len(titles_df))


mini_dfs = np.array_split(titles_df, 2)

train_set = TrainDataset(mini_dfs[0][["isAdult", "runtimeMinutes", "startYear"]]) #TODO: find columns
test_set  = TestDataset(mini_dfs[1][["isAdult", "runtimeMinutes", "startYear"]]) #TODO: select columns

batch_size = 256
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=False)
test_loader  = DataLoader(test_set,  batch_size=batch_size, shuffle=False)

  return bound(*args, **kwds)


In [21]:

device = 'cpu'

class MultiLevelPerceptron(nn.Module):
    def __init__(self):
        super(MultiLevelPerceptron, self).__init__()
        self.linear = nn.Linear(2, 256, bias=False).double()
    def forward(self, x):
        out = self.linear(x)
        return out

model = MultiLevelPerceptron().to(device)
optimizer = torch.optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()


In [22]:
epochs = 3

model.train()
for epoch in range(epochs):
    losses = []
    for batch_num, input_data in enumerate(train_loader):
        optimizer.zero_grad()
        x, y = input_data
        x = x.to(device)
        y = y.to(device)

        output = model(x)
        loss = criterion(output, y)
        loss.backward()
        losses.append(loss.item())

        optimizer.step()

        if batch_num  % 50 == 49:
            print(f'\tEpoch #{epoch+1} | Batch #{batch_num+1} | Loss: {loss.item()}')

    print(f'Epoch #{epoch+1} | Loss: {sum(losses)/len(losses)}')

	Epoch #1 | Batch #50 | Loss: 2583.4861515404796
	Epoch #1 | Batch #100 | Loss: 2376.2450372583244
	Epoch #1 | Batch #150 | Loss: 2168.1412138654346
	Epoch #1 | Batch #200 | Loss: 1972.3326956338933
	Epoch #1 | Batch #250 | Loss: 1773.7190087218441
	Epoch #1 | Batch #300 | Loss: 1574.4324035614468
	Epoch #1 | Batch #350 | Loss: 1382.7618873625213
	Epoch #1 | Batch #400 | Loss: 1195.073135640432
	Epoch #1 | Batch #450 | Loss: 1011.7130671940905
	Epoch #1 | Batch #500 | Loss: 818.0881739976643
	Epoch #1 | Batch #550 | Loss: 656.2787381972655
	Epoch #1 | Batch #600 | Loss: 459.20409635745756
	Epoch #1 | Batch #650 | Loss: 296.9741742439351
	Epoch #1 | Batch #700 | Loss: 114.07983351049452
	Epoch #1 | Batch #750 | Loss: 0.0006642609706478636
	Epoch #1 | Batch #800 | Loss: 2.8678680891831455e-11
	Epoch #1 | Batch #850 | Loss: 1.830759300203711e-05
	Epoch #1 | Batch #900 | Loss: 1.2472852611811152e-12
	Epoch #1 | Batch #950 | Loss: 5.7734366315937926e-08
	Epoch #1 | Batch #1000 | Loss: 11.04

In [27]:
import csv
model.eval()

with open('mlp_submission.csv', 'w') as f:
    fieldnames = ["runtimeMinutes", "startYear", "isAdult"]
    writer = csv.DictWriter(f, fieldnames=fieldnames, lineterminator = '\n')
    writer.writeheader()
    image_id = 1

    with torch.no_grad():
        for x in test_loader:
            x = x.to(device)

            output = model(x).argmax(dim=1)
            for y in output:
                writer.writerow({fieldnames[0]: x, fieldnames[1]: x ,fieldnames[2]: y.item()})
                image_id += 1



TypeError: new(): invalid data type 'str'