# Transform the derived data

## Imports

In [139]:
import praw 
import pandas as pd
import matplotlib as m
from dotenv import load_dotenv
from pathlib import Path
import os

## Reading from Bronze Layer

In [None]:
# Project root is taken as the parent of the current working directory
# (Path() is the cwd, so this depends on where the notebook is launched).
BASE_DIR = Path().resolve().parent

bronze_file_path = BASE_DIR / 'data' / 'bronze' / 'raw_reddit_posts.csv'

# Fail fast with a clear error: without the bronze file subreddit_df is never
# created, and continuing would only surface later as a NameError.
if not bronze_file_path.exists():
    raise FileNotFoundError(f"Path {bronze_file_path} not found!")

subreddit_df = pd.read_csv(bronze_file_path)
print("Path successfully read")

print("Printing extracted data ...")
print(subreddit_df)

## Clean the data

### Check for nulls

In [None]:
# Build the null mask once and reuse it for both the check and the report
null_mask = subreddit_df.isnull()

if not null_mask.values.any():
    print("No nulls were found!")
else:
    # Names of the columns holding at least one null, kept for display
    null_cols = subreddit_df.columns[null_mask.any()].tolist()

    # Remove every row that has a null, modifying the frame in place
    subreddit_df.dropna(inplace=True)

    print(f"Null columns found: {null_cols}")

### Filter out irrelevant posts

In [None]:
# Flag posts whose title contains any of the irrelevant keywords
# (case-insensitive; missing titles are treated as non-matching)
irrelevant_title_mask = subreddit_df['title'].str.contains(
    "questions|question|help|advice|suggestions", case=False, na=False
)

if not irrelevant_title_mask.values.any():
    print("No irrelevant words found in the title!")
else:
    # Remember the titles being dropped so they can be displayed
    irrelevant_titles = subreddit_df.loc[irrelevant_title_mask, 'title'].tolist()

    # Keep only the rows whose title did not match
    subreddit_df = subreddit_df[~irrelevant_title_mask]

    print(f"Titles removed: {irrelevant_titles}")

### Drop titles shorter than 5 characters

In [None]:
# Character count of every title, computed once and reused below
title_lengths = subreddit_df['title'].str.len()
short_title_mask = title_lengths < 5

if short_title_mask.any():

    # Storing the too-short titles for display
    irrelevant_titles = subreddit_df[short_title_mask]['title'].tolist()

    # Keep rows whose title has at least 5 characters. The >= 5 test is used
    # (rather than negating the short-title mask) so rows with no title length
    # are filtered exactly as before.
    subreddit_df = subreddit_df[title_lengths >= 5]

    print(f"Titles removed: {irrelevant_titles}")
else:
    # The check is on characters, not words, so the message says so
    print("No titles with less than 5 characters found!")

### Drop posts whose URL doesn't point to Reddit (e.g. URLs starting with http://)

In [None]:
# Flag rows whose URL begins with the plain 'http://' scheme
http_url_mask = subreddit_df['url'].str.startswith('http://')

if not http_url_mask.any():
    print("No urls starting with http found!")
else:
    # Remember the URLs being dropped so they can be displayed
    irrelevant_urls = subreddit_df.loc[http_url_mask, 'url'].to_list()

    # Keep only the rows whose URL does not start with 'http://'
    subreddit_df = subreddit_df[~http_url_mask]

    print(f"URLs removed: {irrelevant_urls}")

### Remove unwanted characters

## Load to Silver Layer