In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/entire-data/all_data.csv
/kaggle/input/breakthrough-tech-ai-studio-challenge/sample_submission.csv
/kaggle/input/breakthrough-tech-ai-studio-challenge/movies_keywords.csv
/kaggle/input/breakthrough-tech-ai-studio-challenge/movies_metadata.csv
/kaggle/input/breakthrough-tech-ai-studio-challenge/train.csv
/kaggle/input/breakthrough-tech-ai-studio-challenge/test.csv
/kaggle/input/mergedata/merge_data.ipynb


## Train Dataset

In [9]:
train = pd.read_csv("/kaggle/input/breakthrough-tech-ai-studio-challenge/train.csv")
train.head()

Unnamed: 0,userId_movieId,rating
0,10_1358,0.4
1,237_1544,0.7
2,54_373,1.0
3,11_2053,0.8
4,183_2524,0.6


In [10]:
#split "userId_movieId" column to two 'userId', 'movieId'
train[['userId', 'movieId']] = train["userId_movieId"].apply(lambda x: pd.Series(str(x).split("_")))
train = train.drop('userId_movieId', axis=1)
train.head()

Unnamed: 0,rating,userId,movieId
0,0.4,10,1358
1,0.7,237,1544
2,1.0,54,373
3,0.8,11,2053
4,0.6,183,2524


In [11]:
#change the datatype of the id columns from object to integers
train = train.astype({"userId":"int","movieId":"int"})
print(train)

       rating  userId  movieId
0         0.4      10     1358
1         0.7     237     1544
2         1.0      54      373
3         0.8      11     2053
4         0.6     183     2524
...       ...     ...      ...
69997     0.7     308      356
69998     0.6     500      223
69999     1.0     617     2722
70000     0.8     305    45987
70001     0.8     305     2130

[70002 rows x 3 columns]


In [12]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70002 entries, 0 to 70001
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   rating   70002 non-null  float64
 1   userId   70002 non-null  int64  
 2   movieId  70002 non-null  int64  
dtypes: float64(1), int64(2)
memory usage: 1.6 MB


## Principal Component Analysis - PCA 

Principal Component Analysis is a powerful technique for reducing the dimensionality of complex datasets by finding the directions of maximum variance in the data and projecting the data onto these directions. It is widely used in data analysis and machine learning for simplifying complex datasets and improving the accuracy and efficiency of machine learning models. PCA is used for dimensionality reduction because it can help simplify complex datasets and remove noise and redundancy. By reducing the number of dimensions in the data, we can reduce the computational complexity of machine learning algorithms, speed up the training process, and improve the accuracy of the model.

**MinMaxScaler**

In [None]:
data = pd.read_csv("/kaggle/input/final-movie-metadata/final_movie_metadata.csv")
data.head()

In [13]:
# first find the number of components using MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
data_rescaled = scaler.fit_transform(train)

In [14]:
# 95% or 99% of variance 
from sklearn.decomposition import PCA
pca = PCA(n_components = 0.95)
pca.fit(data_rescaled)
data_reduced = pca.transform(data_rescaled)

In [15]:
print(data_reduced)

[[ 0.49293871  0.35349069 -0.05155652]
 [ 0.16259121  0.01183365 -0.05810513]
 [ 0.44352109 -0.31449153 -0.03926992]
 ...
 [-0.3960323  -0.33545155 -0.07117036]
 [ 0.04879994 -0.09323984  0.21048714]
 [ 0.06371889 -0.10165986 -0.05646931]]


In [19]:
target = train['rating']
print(target)

0        0.4
1        0.7
2        1.0
3        0.8
4        0.6
        ... 
69997    0.7
69998    0.6
69999    1.0
70000    0.8
70001    0.8
Name: rating, Length: 70002, dtype: float64


****RandomForestRegressor Model****

Step 1: Load the data and split into training and testing sets

In [20]:
from sklearn.model_selection import train_test_split

data = data_rescaled
X_train, X_test, y_train, y_test = train_test_split(data.data, target, test_size=0.3, random_state=42)


TypeError: only integer scalar arrays can be converted to a scalar index

Step 2. Perform PCA on training data

In [None]:
from sklearn.decomposition import PCA

pca = PCA(n_components=5)
X_train_pca = pca.fit_transform(X_train)


Step 3: Train the Random Forest Regressor model on the transformed training data

In [None]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train_pca, y_train)

Step 4: Transform the testing data using the same PCA transformation and evaluate the model

In [None]:
X_test_pca = pca.transform(X_test)
score = rf.score(X_test_pca, y_test)
print("R^2 Score: {:.2f}".format(score))