In [3]:
# Import the necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

In [4]:
# Load the movie dataset into a DataFrame; only '\n' is treated as the row terminator.
movies = pd.read_csv(
    'mymoviedb.csv',
    lineterminator='\n',
)

In [5]:
# Summarise the freshly loaded frame: column names, non-null counts and dtypes.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9827 entries, 0 to 9826
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Release_Date       9827 non-null   object 
 1   Title              9827 non-null   object 
 2   Overview           9827 non-null   object 
 3   Popularity         9827 non-null   float64
 4   Vote_Count         9827 non-null   int64  
 5   Vote_Average       9827 non-null   float64
 6   Original_Language  9827 non-null   object 
 7   Genre              9827 non-null   object 
 8   Poster_Url         9827 non-null   object 
dtypes: float64(2), int64(1), object(6)
memory usage: 691.1+ KB


In [6]:
# Preview the first five rows of the raw data.
movies.head()

Unnamed: 0,Release_Date,Title,Overview,Popularity,Vote_Count,Vote_Average,Original_Language,Genre,Poster_Url
0,2021-12-15,Spider-Man: No Way Home,Peter Parker is unmasked and no longer able to...,5083.954,8940,8.3,en,"Action, Adventure, Science Fiction",https://image.tmdb.org/t/p/original/1g0dhYtq4i...
1,2022-03-01,The Batman,"In his second year of fighting crime, Batman u...",3827.658,1151,8.1,en,"Crime, Mystery, Thriller",https://image.tmdb.org/t/p/original/74xTEgt7R3...
2,2022-02-25,No Exit,Stranded at a rest stop in the mountains durin...,2618.087,122,6.3,en,Thriller,https://image.tmdb.org/t/p/original/vDHsLnOWKl...
3,2021-11-24,Encanto,"The tale of an extraordinary family, the Madri...",2402.201,5076,7.7,en,"Animation, Comedy, Family, Fantasy",https://image.tmdb.org/t/p/original/4j0PNHkMr5...
4,2021-12-22,The King's Man,As a collection of history's worst tyrants and...,1895.511,1793,7.0,en,"Action, Adventure, Thriller, War",https://image.tmdb.org/t/p/original/aq4Pwv5Xeu...


In [7]:
# Convert the release-date column to pandas datetime values.
movies['Release_Date'] = pd.to_datetime(movies['Release_Date'])

In [8]:
# Re-check dtypes after the conversion of Release_Date above.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9827 entries, 0 to 9826
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Release_Date       9827 non-null   datetime64[ns]
 1   Title              9827 non-null   object        
 2   Overview           9827 non-null   object        
 3   Popularity         9827 non-null   float64       
 4   Vote_Count         9827 non-null   int64         
 5   Vote_Average       9827 non-null   float64       
 6   Original_Language  9827 non-null   object        
 7   Genre              9827 non-null   object        
 8   Poster_Url         9827 non-null   object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(5)
memory usage: 691.1+ KB


In [10]:
# Append the language code and the genre list to each overview, separated by
# single spaces, so one text column carries all three signals.
# NOTE: this overwrites 'Overview' in place, so running the cell twice appends
# the language and genre text a second time.
movies['Overview'] = movies['Overview'].str.cat(
    [movies['Original_Language'], movies['Genre']],
    sep=' ',
)

In [15]:
# The language and genre text now lives inside 'Overview', so drop the two
# source columns. Reassigning instead of using inplace=True follows the
# recommended pandas idiom and keeps the statement chainable.
movies = movies.drop(columns=['Original_Language', 'Genre'])

In [16]:
# Inspect the distinct combined overview strings.
movies['Overview'].unique()

array(['Peter Parker is unmasked and no longer able to separate his normal life from the high-stakes of being a super-hero. When he asks for help from Doctor Strange the stakes become even more dangerous, forcing him to discover what it truly means to be Spider-Man. en Action, Adventure, Science Fiction',
       'In his second year of fighting crime, Batman uncovers corruption in Gotham City that connects to his own family while facing a serial killer known as the Riddler. en Crime, Mystery, Thriller',
       'Stranded at a rest stop in the mountains during a blizzard, a recovering addict discovers a kidnapped child hidden in a car belonging to one of the people inside the building which sets her on a terrifying struggle to identify who among them is the kidnapper. en Thriller',
       ...,
       "When young and successful reporter Jamie finds out that her sister has died in mysterious circumstances, she travels to Singapore to uncover the truth. There, she discovers multiple deaths l

In [17]:
# Build TF-IDF features from the 'Overview' text.
# English stop words are removed, accents are stripped, and word 1- to 3-grams
# are kept only when they appear in at least 10 documents (min_df=10).
tfidf_vectorizer = TfidfVectorizer(
    min_df=10,
    max_features=None,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    stop_words='english',
)

# Fit and transform the 'Overview' column (result is a sparse matrix)
overview_encoded = tfidf_vectorizer.fit_transform(movies['Overview'])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement,
# falling back to the old method only on releases that predate it.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    feature_names = tfidf_vectorizer.get_feature_names()

# Convert the encoded data to a DataFrame.
# NOTE: toarray() densifies the matrix (one float64 per row/feature pair), which
# can take several hundred MB for a vocabulary of a few thousand terms.
# Reusing movies.index guarantees the concat below lines rows up one-to-one.
overview_encoded_df = pd.DataFrame(
    overview_encoded.toarray(),
    columns=feature_names,
    index=movies.index,
)

# Concatenate the encoded data with the original DataFrame
movies_encoded = pd.concat([movies, overview_encoded_df], axis=1)



In [18]:
# List the column labels: the original movie columns followed by one column per TF-IDF term.
movies_encoded.columns

Index(['Release_Date', 'Title', 'Overview', 'Popularity', 'Vote_Count',
       'Vote_Average', 'Poster_Url', '000', '000 years', '1',
       ...
       'zh', 'zh action', 'zh action adventure', 'zh action drama',
       'zh animation', 'zh drama', 'zombie', 'zombies', 'zone', 'zoo'],
      dtype='object', length=5726)

In [19]:
# Scaling values
# Rescale the two numeric columns to the [0, 1] range. MinMaxScaler works
# feature by feature, so fitting both columns in one call gives each column
# its own min/max, exactly as fitting them one at a time would.
numeric_cols = ['Popularity', 'Vote_Count']
movies_encoded[numeric_cols] = MinMaxScaler().fit_transform(
    movies_encoded[numeric_cols].values
)

In [20]:
# Preview the encoded frame after scaling Popularity and Vote_Count.
movies_encoded.head()

Unnamed: 0,Release_Date,Title,Overview,Popularity,Vote_Count,Vote_Average,Poster_Url,000,000 years,1,...,zh,zh action,zh action adventure,zh action drama,zh animation,zh drama,zombie,zombies,zone,zoo
0,2021-12-15,Spider-Man: No Way Home,Peter Parker is unmasked and no longer able to...,1.0,0.287673,8.3,https://image.tmdb.org/t/p/original/1g0dhYtq4i...,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2022-03-01,The Batman,"In his second year of fighting crime, Batman u...",0.752239,0.037037,8.1,https://image.tmdb.org/t/p/original/74xTEgt7R3...,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2022-02-25,No Exit,Stranded at a rest stop in the mountains durin...,0.513693,0.003926,6.3,https://image.tmdb.org/t/p/original/vDHsLnOWKl...,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2021-11-24,Encanto,"The tale of an extraordinary family, the Madri...",0.471117,0.163336,7.7,https://image.tmdb.org/t/p/original/4j0PNHkMr5...,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2021-12-22,The King's Man,As a collection of history's worst tyrants and...,0.37119,0.057695,7.0,https://image.tmdb.org/t/p/original/aq4Pwv5Xeu...,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Persist the encoded feature table to disk. The row index is written too
# (pandas default), so it appears as an unnamed first column in the file.
movies_encoded.to_csv(path_or_buf='movies_encoded.csv')