<a href="https://colab.research.google.com/github/Divija2612/kaggle-csv-demo/blob/main/project1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Step 1: Imports
import pandas as pd
import requests
from abc import ABC, abstractmethod

# Step 2: Abstract Factory Reader
class DataReader(ABC):
    @abstractmethod
    def load_data(self):
        pass

# Step 3: GitHub CSV Reader with error handling
class GitHubCSVReader(DataReader):
    def __init__(self, url):
        self.url = url

    def load_data(self):
        print("Fetching CSV from GitHub (handling errors)...")
        try:
            df = pd.read_csv(self.url, on_bad_lines='skip', encoding='utf-8', engine='python')
            print("GitHub CSV loaded successfully.")
            return df
        except Exception as e:
            print(f"Error loading GitHub CSV: {e}")
            return pd.DataFrame()  # return empty if error

# Step 4: Local CSV Reader with error handling
class LocalCSVReader(DataReader):
    def __init__(self, filepath):
        self.filepath = filepath

    def load_data(self):
        print("Reading CSV from local file (handling errors)...")
        try:
            df = pd.read_csv(self.filepath, on_bad_lines='skip', encoding='utf-8', engine='python')
            print("Local CSV loaded successfully.")
            return df
        except Exception as e:
            print(f"Error loading local CSV: {e}")
            return pd.DataFrame()  # return empty if error

# Step 5: Factory Class
class DataReaderFactory:
    @staticmethod
    def get_reader(source_type, source):
        if source_type == "github":
            return GitHubCSVReader(source)
        elif source_type == "local":
            return LocalCSVReader(source)
        else:
            raise ValueError("Invalid source type")

# Step 6: Setup sources
github_url = "https://raw.githubusercontent.com/Divija2612/kaggle-csv-demo/main/github_data.csv"
local_file_path = "/content/local_data.csv"  # this will be updated after upload

# Upload your local CSV file manually in Colab first
from google.colab import files
uploaded = files.upload()

# If the uploaded file is named differently, update the path
import os
for name in uploaded.keys():
    local_file_path = f"/content/{name}"
    print(f"Local file path set to: {local_file_path}")

# Step 7: Load Data using Factory
github_reader = DataReaderFactory.get_reader("github", github_url)
local_reader = DataReaderFactory.get_reader("local", local_file_path)

github_df = github_reader.load_data()
local_df = local_reader.load_data()

# Step 8: Combine and Transform
print("\nCombining DataFrames...")
combined_df = pd.concat([github_df, local_df], ignore_index=True)
# Step 8.5: Remove duplicate headers from rows
# Assumption: if any row has "Age" as value in 'Age' column, it’s a repeated header
combined_df = combined_df[combined_df['Age'] != 'Age']  # Remove header-like rows
combined_df = combined_df.dropna(subset=['Age'])        # Drop rows where Age is missing
combined_df.reset_index(drop=True, inplace=True)        # Reset index


# Optional transformation: drop rows with too many NaNs
combined_df = combined_df.dropna(thresh=int(0.5 * len(combined_df.columns)))

# Print info
print("\nCombined DataFrame Info:")
print(combined_df.info())

# Step 9: Save result
combined_df.to_csv("/content/combined_output.csv", index=False)
print("Combined CSV saved as 'combined_output.csv'")


Saving local_data.csv to local_data (1).csv
Local file path set to: /content/local_data (1).csv
Fetching CSV from GitHub (handling errors)...
GitHub CSV loaded successfully.
Reading CSV from local file (handling errors)...
Local CSV loaded successfully.

Combining DataFrames...

Combined DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9711 entries, 0 to 9710
Data columns (total 9 columns):
 #   Column                                                                                                                                         Non-Null Count  Dtype  
---  ------                                                                                                                                         --------------  -----  
 0   Age	Gender	Avg_Daily_Screen_Time_hr	Primary_Device	Exceeded_Recommended_Limit	Educational_to_Recreational_Ratio	Health_Impacts	Urban_or_Rural  0 non-null      object 
 1   Age                                                                 

In [4]:
from google.colab import files
files.download('combined_output.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [5]:
transformed_url = "https://raw.githubusercontent.com/Divija2612/kaggle-csv-demo/main/combined_output.csv"
transformed_df = pd.read_csv(transformed_url)
transformed_df.head()


Unnamed: 0,Age\tGender\tAvg_Daily_Screen_Time_hr\tPrimary_Device\tExceeded_Recommended_Limit\tEducational_to_Recreational_Ratio\tHealth_Impacts\tUrban_or_Rural,Age,Gender,Avg_Daily_Screen_Time_hr,Primary_Device,Exceeded_Recommended_Limit,Educational_to_Recreational_Ratio,Health_Impacts,Urban_or_Rural
0,,14.0,Male,3.99,Smartphone,True,0.42,"Poor Sleep, Eye Strain",Urban
1,,11.0,Female,4.61,Laptop,True,0.3,Poor Sleep,Urban
2,,18.0,Female,3.73,TV,True,0.32,Poor Sleep,Urban
3,,15.0,Female,1.21,Laptop,False,0.39,,Urban
4,,12.0,Female,5.89,Smartphone,True,0.49,"Poor Sleep, Anxiety",Urban
