In [44]:
import pandas as pd
import numpy as np

In [45]:
ratings = pd.read_csv("Ratings.csv")

In [46]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19942 entries, 0 to 19941
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Original Id  19942 non-null  object
 1   Rating       16506 non-null  object
dtypes: object(2)
memory usage: 311.7+ KB


In [47]:
ratings['Rating'].unique()

array(['G', 'PG', 'PG-13', 'R', '1995', 'Unrated', nan, 'Not Rated',
       'Approved', 'NC-17', 'TV-MA', 'TV-PG', 'Passed', 'GP', 'TV-14',
       '1996', 'M/PG', 'X', '1998', '2000', '1983', '18+', 'M', 'TV-G',
       '1973', '1994–2022', '2002', '1984', '1987', '1971', '1999',
       '2001', '2004', '2003', '2005', '1988', '2006', '1974', '1993',
       '1977', '1967', '16+', '2008', '2007', '2009', '1966', 'Open',
       'TV-13', '2010', '2011', '1997', '1990', '1980', '2013', 'TV-Y',
       '2012', '2014', '1985', '1970', '2015', 'TV-Y7-FV', '1976',
       'TV-Y7', '13+', '1975', '2016', '2017', '2013–2014'], dtype=object)

In [48]:
def is_year(value):
    try:
        int(value)
        return len(str(value)) == 4  # Assuming years are 4 digits
    except (ValueError, TypeError):
        return False

ratings['Rating'] = ratings['Rating'].apply(lambda x: 'Not Rated' if is_year(x) else x)

print(ratings['Rating'].unique())

['G' 'PG' 'PG-13' 'R' 'Not Rated' 'Unrated' nan 'Approved' 'NC-17' 'TV-MA'
 'TV-PG' 'Passed' 'GP' 'TV-14' 'M/PG' 'X' '18+' 'M' 'TV-G' '1994–2022'
 '16+' 'Open' 'TV-13' 'TV-Y' 'TV-Y7-FV' 'TV-Y7' '13+' '2013–2014']


In [49]:
def convert_tv_to_movie_rating(tv_rating):
    tv_to_movie_map = {
        'TV-MA': 'R',
        'TV-PG': 'PG',
        'Passed': 'G',
        'Approved': 'G',
        'GP': 'PG',
        'TV-14': 'PG-13',
        'M/PG': 'PG',
        'X': 'NC-17',
        '18+': 'NC-17',
        'M': 'PG',
        'TV-G': 'G',
        '16+': 'R',  
        'Open': 'G', 
        'TV-13': 'PG-13', 
        'TV-Y': 'G',    
        'TV-Y7-FV': 'G', 
        'TV-Y7': 'G',   
        '13+': 'PG-13'
    }
    return tv_to_movie_map.get(tv_rating, tv_rating)

In [50]:
ratings['Rating'] = ratings['Rating'].apply(convert_tv_to_movie_rating)

print(ratings['Rating'].unique())

['G' 'PG' 'PG-13' 'R' 'Not Rated' 'Unrated' nan 'NC-17' '1994–2022'
 '2013–2014']


In [51]:
values_to_replace = [np.nan, '1994–2022', '2013–2014']
ratings['Rating'] = ratings['Rating'].replace(values_to_replace, 'Not Rated')

print(ratings['Rating'].unique())

['G' 'PG' 'PG-13' 'R' 'Not Rated' 'Unrated' 'NC-17']


In [52]:
print(ratings['Rating'])

0                G
1               PG
2            PG-13
3                R
4               PG
           ...    
19937      Unrated
19938    Not Rated
19939    Not Rated
19940    Not Rated
19941            R
Name: Rating, Length: 19942, dtype: object


In [53]:
movies = pd.read_csv('final_movie_dataset.csv')

In [54]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19942 entries, 0 to 19941
Data columns (total 29 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   movieId            19942 non-null  float64
 1   imdb_id            19942 non-null  object 
 2   original_title     19942 non-null  object 
 3   release_date       19942 non-null  object 
 4   runtime            19942 non-null  float64
 5   title              19942 non-null  object 
 6   budget             19942 non-null  float64
 7   revenue            19942 non-null  float64
 8   popularity         19942 non-null  float64
 9   vote_average       19942 non-null  float64
 10  vote_count         19942 non-null  float64
 11  director           19942 non-null  object 
 12  prod_comp_encoded  19942 non-null  float64
 13  0                  19942 non-null  float64
 14  1                  19942 non-null  float64
 15  2                  19942 non-null  float64
 16  3                  199

In [55]:
ratings.rename(columns={'Original Id':'imdb_id'}, inplace=True)

In [56]:
merged = pd.merge(ratings, movies, on='imdb_id', how='inner')
print("\nInner Join:")
print(merged.head())


Inner Join:
     imdb_id Rating  movieId               original_title release_date  \
0  tt0114709      G    862.0                    Toy Story   1995-10-30   
1  tt0113497     PG   8844.0                      Jumanji   1995-12-15   
2  tt0113228  PG-13  15602.0             Grumpier Old Men   1995-12-22   
3  tt0114885      R  31357.0            Waiting to Exhale   1995-12-22   
4  tt0113041     PG  11862.0  Father of the Bride Part II   1995-02-10   

   runtime                        title      budget      revenue  popularity  \
0     81.0                    Toy Story  30000000.0  373554033.0   21.946943   
1    104.0                      Jumanji  65000000.0  262797249.0   17.015539   
2    101.0             Grumpier Old Men         0.0   71518503.0   11.712900   
3    127.0            Waiting to Exhale  16000000.0   81452156.0    3.859495   
4    106.0  Father of the Bride Part II         0.0   76578911.0    8.387519   

   ...         6         7         8         9        10     

In [58]:
merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19964 entries, 0 to 19963
Data columns (total 30 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   imdb_id            19964 non-null  object 
 1   Rating             19964 non-null  object 
 2   movieId            19964 non-null  float64
 3   original_title     19964 non-null  object 
 4   release_date       19964 non-null  object 
 5   runtime            19964 non-null  float64
 6   title              19964 non-null  object 
 7   budget             19964 non-null  float64
 8   revenue            19964 non-null  float64
 9   popularity         19964 non-null  float64
 10  vote_average       19964 non-null  float64
 11  vote_count         19964 non-null  float64
 12  director           19964 non-null  object 
 13  prod_comp_encoded  19964 non-null  float64
 14  0                  19964 non-null  float64
 15  1                  19964 non-null  float64
 16  2                  199

In [59]:
merged.to_csv('Final_dataset_with_ratings.csv', index=False)