In [2]:
import pandas as pd
import os

# Per-genre CSV exports to merge (adjust these file names as needed).
# Entries may carry a directory prefix; only the base file name is used to
# derive the genre label.
csv_files = [
    'action_movies.csv',
    'comedy_movies.csv',
    'drama_movies.csv',
    'adventure_movies.csv',
    'thriller_movies.csv'
]

# Read each CSV and tag its rows with the genre encoded in the file name
# ('action_movies.csv' -> 'action').
dfs = []
for file in csv_files:
    df = pd.read_csv(file)
    # basename() strips any directory part first, so 'data/action_movies.csv'
    # still yields 'action' rather than 'data/action'.
    df['Genre'] = os.path.basename(file).split('_')[0]
    dfs.append(df)

# Stack all genre frames into one frame with a fresh 0..n-1 index.
combined_df = pd.concat(dfs, ignore_index=True)

# Persist the merged data. The file name (spelled 'megred') is read back by a
# later cell, so it must stay in sync with that cell's input path.
combined_df.to_csv('megred_movies.csv', index=False)

# Show the first few rows of the combined DataFrame
print(combined_df.head())

                      Title  Rating    Votes Duration   Genre
0          1. Solo Leveling     8.5    (60K)    TV-MA  action
1      2. Kraven the Hunter     5.4    (47K)    2h 7m  action
2  3. The Day of the Jackal     8.2    (88K)    TV-MA  action
3               4. Twisters     6.5   (161K)    2h 2m  action
4           5. Gladiator II     6.5   (219K)   2h 28m  action


In [None]:
import pandas as pd
import numpy as np
import re  # Regular expressions for cleaning text

# Load the Duration-cleaned CSV. This file is written by the Duration-cleaning
# cell (shown further down as In [5]), so that cell has to run before this one.
file_path = "cleaned_megred_movies.csv"  # Update path if the file lives elsewhere
df = pd.read_csv(file_path)

# Fix 'Votes' column: Convert values like '60K' to 60000
def clean_votes(value):
    """Convert a raw vote count such as '(60K)' or '(1.2M)' to an integer.

    Parameters
    ----------
    value : str, number, or missing value
        Raw cell content. Strings may contain decoration (parentheses,
        commas, spaces) and an optional magnitude suffix K, M or B in
        either case.

    Returns
    -------
    int or float
        The vote count as an int, or ``np.nan`` when the value is missing,
        blank, or cannot be parsed. Unparsable values are also reported on
        stdout.
    """
    multipliers = {"K": 1_000, "M": 1_000_000, "B": 1_000_000_000}
    try:
        if isinstance(value, str):
            # Upper-case first so '60k' and '60K' are treated alike, then drop
            # everything except digits, the decimal point and the suffixes.
            text = re.sub(r"[^\dKMB.]", "", value.upper())

            # Blank strings and textual NaN ('nan' strips down to '') are
            # missing data, not conversion errors.
            if not text:
                return np.nan

            suffix = text[-1]
            if suffix in multipliers:
                # round() rather than int(): float scaling can land a hair
                # below the intended integer and int() would truncate it.
                return round(float(text[:-1]) * multipliers[suffix])

            return int(float(text))  # Plain numeric string

        if pd.isna(value):  # Handle missing values
            return np.nan

        return int(value)  # Convert valid numbers to int

    except Exception as e:
        print(f"⚠️ Error converting Votes: {value} - {e}")
        return np.nan  # Handle conversion errors safely

# Apply the cleaning function to the raw values. The column is deliberately
# not cast to str first: missing values stay real NaN and are handled as
# missing data rather than as the text 'nan'.
df["Votes"] = df["Votes"].apply(clean_votes)

# Fill missing counts with the median of the valid ones; fall back to 0 when
# no valid value exists, because NaN cannot be cast to int64.
valid_votes = df["Votes"].dropna()
fill_votes = int(valid_votes.median()) if not valid_votes.empty else 0
df["Votes"] = df["Votes"].fillna(fill_votes).astype("int64")

# Save cleaned file
cleaned_file_path = "final_megred_movies.csv"
df.to_csv(cleaned_file_path, index=False)

print(f"✅ Data cleaned successfully! File saved as: {cleaned_file_path}")

In [5]:
import pandas as pd
import numpy as np
import re

# Load the merged CSV produced by the merge cell (which writes
# 'megred_movies.csv'); update the path as needed for your system.
file_path = "megred_movies.csv"
df = pd.read_csv(file_path)

# Fix 'Duration' column: Convert '1h 26m' → 86 minutes
def convert_duration(value):
    """Convert a duration such as '1h 26m' to a total number of minutes.

    Parameters
    ----------
    value : str, number, or missing value
        Raw cell content. Strings are expected to look like '2h 7m', '45m'
        or '3h'; a plain numeric string is taken as minutes already.

    Returns
    -------
    int or float
        Total minutes as an int, or ``np.nan`` when the value is missing or
        holds no recognisable duration (for example a certificate such as
        'TV-MA' that ended up in this column).
    """
    try:
        if isinstance(value, str):
            hours = re.findall(r'(\d+)h', value)  # Extract hours
            minutes = re.findall(r'(\d+)m', value)  # Extract minutes

            if not hours and not minutes:
                # No 'h'/'m' token at all: accept a bare number of minutes;
                # anything else raises ValueError and becomes NaN below
                # instead of silently turning into 0 minutes.
                return int(float(value))

            total_minutes = int(hours[0]) * 60 if hours else 0
            total_minutes += int(minutes[0]) if minutes else 0
            return total_minutes

        if pd.isna(value):
            return np.nan  # Keep NaN

        return int(value)  # Already in minutes
    except (ValueError, TypeError, OverflowError):
        return np.nan  # Unparsable input is treated as missing

# Convert every raw duration to minutes (NaN where it cannot be determined).
df["Duration"] = df["Duration"].apply(convert_duration)

# Fill missing durations with the median of the valid ones; fall back to 0
# when no valid value exists, because NaN cannot be cast to int64.
valid_durations = df["Duration"].dropna()
fill_minutes = int(valid_durations.median()) if not valid_durations.empty else 0
df["Duration"] = df["Duration"].fillna(fill_minutes).astype("int64")

# Save cleaned file
cleaned_file_path = "cleaned_megred_movies.csv"
df.to_csv(cleaned_file_path, index=False)

print(f"✅ Data cleaned successfully! File saved as: {cleaned_file_path}")


✅ Data cleaned successfully! File saved as: cleaned_megred_movies.csv
