In [36]:
import pandas as pd
import chardet
from imdb import IMDb


In [37]:
# Load the raw IMDb Top 250 TV shows table from its CSV file.
RAW_CSV_PATH = '/Users/dono/Documents/data_projects/imdb_Top_250_TV_Shows.csv'
data = pd.read_csv(RAW_CSV_PATH)



In [38]:
# Normalise column names: spaces become underscores, then every character that
# is not a letter, digit or underscore is stripped.
# `regex` is passed explicitly on both calls: pandas >= 2.0 treats the pattern
# as a literal string by default, which would turn the character-class clean-up
# into a silent no-op.
data.columns = (
    data.columns
    .str.replace(' ', '_', regex=False)
    .str.replace(r'[^A-Za-z0-9_]', '', regex=True)
)


In [39]:
# Split 'Release_Year' (e.g. '2008–2013') on the en dash into 'Start_Year' and
# 'End_Year'. Rows holding a single year end up with a missing 'End_Year'.
# n=1 caps the split at two pieces, so the two-column assignment below cannot
# fail with a column-count mismatch if a value ever contains more than one dash.
year_parts = data['Release_Year'].str.split('–', n=1, expand=True)
data[['Start_Year', 'End_Year']] = year_parts


In [40]:
# Strip the ' eps' suffix from every 'Episodes' entry, then cast what is left to int.
episode_counts = data['Episodes'].str.replace(' eps', '')
data['Episodes'] = episode_counts.astype(int)


In [41]:
# Preview the first rows (up to 20) to check the renamed and newly split columns.
data.head(20)

Unnamed: 0,Shows_Name,Release_Year,Episodes,Rating,Rating_given_by_people,Start_Year,End_Year
0,Breaking Bad,2008–2013,62,9.5,(2.2M),2008,2013.0
1,Planet Earth II,2016,6,9.5,(163K),2016,
2,Planet Earth,2006,11,9.4,(224K),2006,
3,Band of Brothers,2001,10,9.4,(547K),2001,
4,Chernobyl,2019,5,9.3,(912K),2019,
5,The Wire,2002–2008,60,9.3,(392K),2002,2008.0
6,Avatar: The Last Airbender,2005–2008,62,9.3,(392K),2005,2008.0
7,Blue Planet II,2017,7,9.3,(49K),2017,
8,The Sopranos,1999–2007,86,9.2,(501K),1999,2007.0
9,Cosmos: A Spacetime Odyssey,2014,13,9.2,(132K),2014,


In [42]:
# Summarise column dtypes and non-null counts after the year/episode clean-up.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Shows_Name              250 non-null    object 
 1   Release_Year            250 non-null    object 
 2   Episodes                250 non-null    int64  
 3   Rating                  250 non-null    float64
 4   Rating_given_by_people  250 non-null    object 
 5   Start_Year              250 non-null    object 
 6   End_Year                208 non-null    object 
dtypes: float64(1), int64(1), object(5)
memory usage: 13.8+ KB


In [43]:
# Cast the vote-count column to str ahead of the string clean-up that follows.
votes_column = data['Rating_given_by_people']
data['Rating_given_by_people'] = votes_column.astype(str)


In [44]:
# Drop the '(' and ')' characters from each vote-count value, e.g. '(2.2M)' -> '2.2M'.
paren_pattern = r'[()]'
data['Rating_given_by_people'] = data['Rating_given_by_people'].str.replace(paren_pattern, '', regex=True)


In [45]:
# Rewrite the magnitude suffixes as multiplications: 'K' -> '*1e3', 'M' -> '*1e6'.
suffix_to_factor = {'K': '*1e3', 'M': '*1e6'}
data['Rating_given_by_people'] = data['Rating_given_by_people'].replace(suffix_to_factor, regex=True)


In [46]:
# Evaluate each expression string (e.g. '2.2*1e6') into a number; missing values are skipped.
# NOTE(review): pd.eval runs text that originates from the CSV file, which is only
# acceptable for a trusted input file.
vote_expressions = data['Rating_given_by_people']
data['Rating_given_by_people'] = vote_expressions.map(pd.eval, na_action='ignore')


In [47]:
# Single-chain variant of the vote-count clean-up: turn 'K'/'M' into exponent
# suffixes, drop parentheses, and cast the column to float.
data['Rating_given_by_people'] = (
    data['Rating_given_by_people']
    .replace('K', 'e3', regex=True)
    .replace('M', 'e6', regex=True)
    .replace(r'[()]', '', regex=True)
    .astype(float)
)

In [48]:
# Preview the first 10 rows to confirm the vote counts are now numeric.
data.head(10)

Unnamed: 0,Shows_Name,Release_Year,Episodes,Rating,Rating_given_by_people,Start_Year,End_Year
0,Breaking Bad,2008–2013,62,9.5,2200000.0,2008,2013.0
1,Planet Earth II,2016,6,9.5,163000.0,2016,
2,Planet Earth,2006,11,9.4,224000.0,2006,
3,Band of Brothers,2001,10,9.4,547000.0,2001,
4,Chernobyl,2019,5,9.3,912000.0,2019,
5,The Wire,2002–2008,60,9.3,392000.0,2002,2008.0
6,Avatar: The Last Airbender,2005–2008,62,9.3,392000.0,2005,2008.0
7,Blue Planet II,2017,7,9.3,49000.0,2017,
8,The Sopranos,1999–2007,86,9.2,501000.0,1999,2007.0
9,Cosmos: A Spacetime Odyssey,2014,13,9.2,132000.0,2014,


In [49]:
# Create the IMDb client object that get_genres uses for its lookups.
ia = IMDb()


In [50]:
# Genre lookup helper for a single show title.
def get_genres(title):
    """Return the genres of the first search hit for `title`, or None.

    The lookup goes through the module-level `ia` client: the title is
    searched, the first hit is refreshed with its 'main' info set, and the
    hit's 'genres' entry is returned. The first hit is used as-is, with no
    check that it is the intended show.

    Returns None when the search yields nothing, when the hit has no
    'genres' entry, or when any exception is raised (the error is printed).
    """
    try:
        matches = ia.search_movie(title)
        if not matches:
            return None
        top_hit = matches[0]
        # Fetch the 'main' info set, which the 'genres' lookup below reads from.
        ia.update(top_hit, info=['main'])
        genres = top_hit.get('genres')
    except Exception as e:
        print(f"Error retrieving information for {title}: {str(e)}")
        genres = None
    return genres

In [53]:
# Look up the genres of every show by its name and store them in a new 'Genres' column.
# get_genres is called once per row — presumably a remote IMDb lookup each time, so expect this cell to be slow.
show_titles = data['Shows_Name']
data['Genres'] = show_titles.apply(get_genres)


In [54]:
# Check non-null counts and dtypes again now that the 'Genres' column has been added.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250 entries, 0 to 249
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Shows_Name              250 non-null    object 
 1   Release_Year            250 non-null    object 
 2   Episodes                250 non-null    int64  
 3   Rating                  250 non-null    float64
 4   Rating_given_by_people  250 non-null    float64
 5   Start_Year              250 non-null    object 
 6   End_Year                208 non-null    object 
 7   Genres                  250 non-null    object 
dtypes: float64(2), int64(1), object(5)
memory usage: 15.8+ KB


In [55]:
# 'Release_Year' is redundant now that 'Start_Year' and 'End_Year' exist, so drop it.
data = data.drop(columns='Release_Year')


In [56]:
# Build the final frame `df` with short, lower-case column names.
column_renames = {
    'Shows_Name': 'title',
    'Episodes': 'episodes',
    'Rating': 'rating',
    'Rating_given_by_people': 'ratings_given',
    'Start_Year': 'start_year',
    'End_Year': 'end_year',
    'Genres': 'genres',
}
df = data.rename(columns=column_renames)

In [58]:
# Preview the renamed frame, including the new 'genres' column.
df.head()

Unnamed: 0,title,episodes,rating,ratings_given,start_year,end_year,genres
0,Breaking Bad,62,9.5,2200000.0,2008,2013.0,"[Crime, Drama, Thriller]"
1,Planet Earth II,6,9.5,163000.0,2016,,[Documentary]
2,Planet Earth,11,9.4,224000.0,2006,,[Documentary]
3,Band of Brothers,10,9.4,547000.0,2001,,"[Action, Drama, History, War]"
4,Chernobyl,5,9.3,912000.0,2019,,"[Drama, History, Thriller]"


In [62]:
# Write the cleaned frame to a UTF-8 encoded CSV file, leaving out the index column.
# NOTE(review): 'genres' holds Python lists, which presumably end up in the file as
# their string representation — confirm that downstream readers expect this.
OUTPUT_CSV = 'TopImdbShows.csv'
df.to_csv(OUTPUT_CSV, encoding='utf-8', index=False)
