In [7]:
import pandas as pd       # data manipulation
import numpy as np        # numerical operations
import re                 # regular expressions
from pathlib import Path  # handle filesystem paths


In [8]:
def load_excel(path: Path, filename: str):
    file = path / filename                          # create full path
    if not file.exists():                           # check if file exists
        raise FileNotFoundError(f"File not found: {file}")
    df = pd.read_excel(file, engine="openpyxl")     # read excel file with openpyxl
    print(f"Loaded {filename}: {df.shape[0]} rows, {df.shape[1]} cols")
    return df


In [9]:
def split_and_strip(s):
    if pd.isna(s) or str(s).strip() == '':
        return []
    return [item.strip() for item in str(s).split(',') if item.strip() != '']

base_path = Path(r"C:\\Users\\chris\\Downloads\\")

df = load_excel(base_path, "metaClean43Brightspace.xlsx")  # load movies data

df_genre= df.filter(items=['title', 'genre'])


Loaded metaClean43Brightspace.xlsx: 11364 rows, 13 cols


In [10]:
if 'genre' in df_genre.columns:
    df_genre['genre'] = df_genre['genre'].apply(split_and_strip)
    #print(df_genre['genre'])
    
# ---- Create normalized genre table ----

#explode genre lists into individual rows
norm_genre=df_genre[['genre']].explode('genre').dropna()

#drop duplicates to get unique genres
norm_genre = norm_genre.drop_duplicates().reset_index(drop=True)

#add genre_id
norm_genre['genre_id'] = range(1, len(norm_genre) + 1)

#reorder columns
norm_genre = norm_genre[['genre_id', 'genre']]

#print check
print("Unique genres:", len(norm_genre))
print(norm_genre.head(20))



Unique genres: 27
    genre_id        genre
0          1  Documentary
1          2       Action
2          3       Sci-Fi
3          4        Drama
4          5      Mystery
5          6     Thriller
6          7       Horror
7          8       Comedy
8          9      Romance
9         10    Adventure
10        11      Fantasy
11        12       Family
12        13        Crime
13        14          War
14        15      History
15        16        Sport
16        17    Biography
17        18        Music
18        19         News
19        20      Musical


In [11]:
output_file = 'C:\\Users\\chris\\Downloads\\genre.csv' #makes an output file of the clean data sheet
norm_genre.to_csv(output_file, index=False)

In [12]:
import psycopg2
import pandas as pd
from sqlalchemy import create_engine

In [13]:
conn_string = 'postgresql://postgres:8536@localhost:5432/postgres' #to make a connection with postgres

db = create_engine(conn_string)
conn = db.connect()

conn1 = psycopg2.connect(                     #setup a connection and provide the information about the server (local host)
    database="postgres",
    user="postgres",
    password="8536",
    host="localhost",
    port="5432"
)

conn1.autocommit = True
cursor = conn1.cursor()

In [14]:
cursor.execute('drop table if exists genre') #Dropping the tables that are alreadt exsist

sql = "CREATE TABLE genre (genre_id, genre)"
#creating the table with the columns I want in postgres

In [15]:
data = pd.read_csv("C:\\Users\\chris\\Downloads\\genre.csv") #to import the clean file in postgres

data = data[["genre_id", "genre"]]
#this is to create the dataframe

In [16]:
data.to_sql('genre', conn, if_exists= 'replace')
#to convert the data into sql, if it already exists it will replace the data

27

In [17]:
sql1 = "SELECT * FROM genre"                #to retrieve all the rows from the datafile
cursor.execute(sql1)
rows = cursor.fetchall()
for row in rows:
    print(row)

(0, 1, 'Documentary')
(1, 2, 'Action')
(2, 3, 'Sci-Fi')
(3, 4, 'Drama')
(4, 5, 'Mystery')
(5, 6, 'Thriller')
(6, 7, 'Horror')
(7, 8, 'Comedy')
(8, 9, 'Romance')
(9, 10, 'Adventure')
(10, 11, 'Fantasy')
(11, 12, 'Family')
(12, 13, 'Crime')
(13, 14, 'War')
(14, 15, 'History')
(15, 16, 'Sport')
(16, 17, 'Biography')
(17, 18, 'Music')
(18, 19, 'News')
(19, 20, 'Musical')
(20, 21, 'Western')
(21, 22, 'Animation')
(22, 23, 'Talk-Show')
(23, 24, 'Adult')
(24, 25, 'Short')
(25, 26, 'Reality-TV')
(26, 27, 'Film-Noir')


In [18]:
conn1.commit()
conn1.close()