## Import tools

In [None]:
import io
import os
import os.path as path
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from PIL import Image

## Fetching file

In [None]:
# Absolute, machine-specific location of the raw Reddit export.
# NOTE: rebinding `path` here hides the `os.path as path` alias from the imports.
path = "C:/Users/Chris/Desktop/DTU/4. Semester/02466 - Project Work/RedditDataWithLinks_Master.csv"
# Read the whole CSV into the working frame used by the clean-up cells.
posts_df = pd.read_csv(filepath_or_buffer=path)

##### Preview samples

In [None]:
# Show every column, but cap the printed rows so the preview stays readable.
pd.set_option('display.max_columns', None)
# Use the full option name: abbreviated names only work while they match
# exactly one option and can break when pandas adds similarly named options.
pd.set_option('display.max_rows', 25)
print(posts_df)


## Clean up data

##### Get list of all column names

In [None]:
# Materialise the column labels as a plain list and print them, so the
# unwanted ones can be picked out by eye.
colunmNames = list(posts_df.columns)
print(colunmNames)

##### Remove unwanted columns

In [None]:
# Remove the columns that are not needed, in place. errors='ignore' lets the
# call succeed even when some of the listed labels are absent from the frame.
posts_df.drop(
    columns=[
        'quarantine', 'link_flair_template_id', 'edited_on', 'call_to_action',
        'subreddit_name_prefixed', 'discussion_type', 'post_categories',
        'content_categories', 'category', 'can_gild',
        'author_flair_template_id', 'from', 'from_kind', 'from_id', 'hidden',
        'likes', 'retrieved_utc', 'created', 'report_reasons', 'approved_by',
        'saved', 'removal_reason', 'num_reports', 'author_flair_css_class',
        'author_flair_text', 'gilded', 'mod_reports', 'user_reports',
        'brand_safe', 'contest_mode', 'spoiler', 'suggested_sort',
        'author_flair_richtext', 'author_flair_type', 'link_flair_richtext',
        'link_flair_text_color', 'link_flair_type', 'rte_mode',
        'subreddit_type', 'thumbnail_height', 'thumbnail_width',
        'author_flair_background_color', 'author_flair_text_color',
        'author_patreon_flair', 'gildings', 'is_robot_indexable',
        'link_flair_background_color', 'send_replies', 'no_follow',
        'updated_utc', 'all_awardings', 'allow_live_comments',
        'author_premium', 'awarders', 'total_awards_received',
        'treatment_tags', 'is_created_from_ads_ui', 'parent_whitelist_status',
        'pwls', 'url_overridden_by_dest', 'whitelist_status', 'wls',
        'removed_by_category', 'approved_at_utc', 'banned_at_utc',
        'removed_by', 'top_awarded_type', 'retrieved_on',
    ],
    inplace=True,
    errors='ignore',
)

# Print the surviving labels and how many there are, then discard the
# temporary list.
colunmNames = posts_df.columns.to_list()
print(colunmNames, len(colunmNames), sep='\n')
del colunmNames

##### Change column names and reorder columns

1. Create dictionary - 'old name' : 'new name'

In [None]:
# Rename table for the post columns, written as 'old name': 'new name'.
# It is passed to DataFrame.rename in the next step.
column_names = {
    'id': 'PostID',
    'subreddit': 'Subreddit',
    'subreddit_id': 'SubredditID',
    'created_utc': 'PostTime',
    'title': 'PostTitle',
    'author': 'Username',
    'author_created_utc': 'UserCreatedTime',
    'author_fullname': 'AuthorName',
    'domain': 'ImageDomain',
    'full_link': 'Link',
    'is_self': 'IsTextPost',
    'media_embed': 'EmbeddedMedia',
    'secure_media_embed': 'SecureEmbeddedMedia',
    'num_comments': 'CommentNumber',
    'over_18': 'NSFW',
    'permalink': 'Permalink',
    'score': 'Upvotes',
    'selftext': 'PostText',
    'thumbnail': 'Thumbnail',
    'url': 'ImageURL',
    'media': 'Media',
    'secure_media': 'SecureMedia',
    'stickied': 'Stickied',
    'locked': 'CommentsLocked',
    'post_hint': 'PostHint',
    'preview': 'Preview',
    'is_crosspostable': 'IsCrosspostable',
    'is_reddit_media_domain': 'IsRedditMediaDomain',
    'is_video': 'IsVideo',
    'num_crossposts': 'CrosspostsNumber',
    'pinned': 'Pinned',
    'crosspost_parent': 'CrosspostParent',
    'crosspost_parent_list': 'CrosspostParentList',
    'is_meta': 'IsMeta',
    'is_original_content': 'IsOriginal',
    'media_only': 'OnlyMedia',
    'subreddit_subscribers': 'SubRedditSubscribers',
    'media_metadata': 'MediaMetadata',
    'upvote_ratio': 'UpvoteRatio',
    'gallery_data': 'GalleryData',
    'is_gallery': 'IsGallery',
    'author_cakeday': 'AuthorBirthdate',
    'edited': 'Edited',
    'view_count': 'ViewCount',
    'author_id': 'AuthorID',
    'og_description': 'OGDescription',
    'og_title': 'OGTitle',
    'utc_datetime_str': 'TimeString',
    'ups': 'Ups',
    'downs': 'Downs',
    'selftext_html': 'SelfTextHTML',
    'distinguished': 'Distinguished',
    'link_flair_css_class': 'LinkFlairClass',
    'link_flair_text': 'LinkFlairText',
    'archived': 'Archived',
    'hide_score': 'IsHideScore',
}

2. Rename columns using dictionary

In [None]:
# Build the tidy frame by applying the old -> new label mapping to the columns.
posts_tidy_df = posts_df.rename(column_names, axis='columns')
# Echo the resulting labels to confirm the rename took effect
posts_tidy_df.columns

3. Reorder columns

In [None]:
# Desired left-to-right order of the tidy frame's columns.
column_order = [
    'Subreddit', 'SubredditID', 'PostTitle', 'PostID', 'TimeString',
    'PostTime', 'Username', 'ViewCount', 'Upvotes', 'Ups', 'Downs',
    'UpvoteRatio', 'CommentNumber', 'Edited', 'ImageDomain', 'ImageURL',
    'Permalink', 'IsTextPost', 'PostText', 'SelfTextHTML', 'UserCreatedTime',
    'AuthorName', 'Distinguished', 'LinkFlairClass', 'LinkFlairText',
    'AuthorBirthdate', 'IsVideo', 'IsMeta', 'IsOriginal',
    'IsRedditMediaDomain', 'IsCrosspostable', 'CrosspostsNumber',
    'CrosspostParent', 'CrosspostParentList', 'SubRedditSubscribers',
    'OnlyMedia', 'EmbeddedMedia', 'SecureEmbeddedMedia', 'Media',
    'SecureMedia', 'Thumbnail', 'Stickied', 'Archived', 'IsHideScore',
    'Pinned', 'PostHint', 'Preview', 'CommentsLocked', 'NSFW', 'NewURL',
]

# Selecting a label that is not in the frame raises KeyError, and 'NewURL' is
# only created in the "Fix image URLs" step. Keep the requested order but skip
# labels the frame does not have (yet).
posts_tidy_df = posts_tidy_df[
    [label for label in column_order if label in posts_tidy_df.columns]
]

In [None]:
# Print the reordered frame for a quick visual check.
print(posts_tidy_df)

#### Fix image URLs

In [None]:
# Create the column that will hold the repaired image URLs.
# Only add it when it is missing: re-indexing with an extra 'NewURL' label on a
# frame that already has one would produce two columns with the same name.
if 'NewURL' not in posts_tidy_df.columns:
    # object dtype so URL strings can be stored later without a dtype clash
    # against an all-NaN float column.
    posts_tidy_df = posts_tidy_df.assign(
        NewURL=pd.Series(index=posts_tidy_df.index, dtype=object)
    )

# NOTE(review): only the first 50 rows are kept from here on, so everything
# written out below is limited to them -- looks like a debugging limit, confirm.
# .copy() gives an independent frame so the per-cell writes that follow do not
# target a slice of the original.
posts_tidy_df = posts_tidy_df.head(50).copy()

In [None]:
# Protocol-relative URL of an image hosted on Flickr's static server.
# The server id is matched with [0-9]+ so ids longer than four digits
# (e.g. 65535) are accepted as well.
_FLICKR_IMAGE_RE = re.compile(
    r'(//live\.staticflickr\.com/[0-9]+/[a-zA-Z0-9_]+\.(?:png|jpg|jpeg|gif|svg))'
)


def _find_flickr_image(html):
    """Return the first static Flickr image URL found in *html*, or None."""
    match = _FLICKR_IMAGE_RE.search(html)
    return match.group(1) if match else None


def _resolve_image_url(domain, url):
    """Return a directly downloadable image URL, or None when none is known.

    domain -- value of the row's 'ImageDomain' column
    url    -- value of the row's 'ImageURL' column (may be missing/NaN)
    """
    if pd.isna(url):
        return None

    if domain == 'flickr.com':
        print(url)
        try:
            # The page source is searched directly, so no HTML parser is
            # needed. A timeout keeps one dead host from stalling the loop.
            response = requests.get(url, timeout=30)
        except requests.RequestException as exc:
            print(f"Could not fetch {url}: {exc}")
            return None
        image_url = _find_flickr_image(response.text)
        print(image_url)
        return image_url

    if domain == 'imgur.com':
        # Point the page URL at the image host (http and https alike) and add
        # the file extension.
        return re.sub(r'^(https?)://imgur\.com', r'\1://i.imgur.com', url) + '.jpg'

    if domain == 'i.imgur.com':
        # Already a direct image link.
        return url

    return None


for index, row in posts_tidy_df.iterrows():
    new_url = _resolve_image_url(row['ImageDomain'], row['ImageURL'])
    # Rows without a resolvable URL keep their current 'NewURL' value, so a
    # failed lookup can never inherit the URL found for an earlier row.
    if new_url is not None:
        posts_tidy_df.at[index, 'NewURL'] = new_url

# Save the new DataFrame as a CSV file
posts_tidy_df.to_csv("C:/Users/sebas/OneDrive/Dokumenter/skole/4 Semester/Fagprojekt/RedditDataWithLinks.csv", index=False)

## Save data frame and images // START FROM HERE IF YOU ALREADY HAVE THE MASTER FILE

#### View data frame

In [None]:
# Read the master CSV straight into the tidy frame.
path = "C:/Users/Chris/Desktop/DTU/4. Semester/02466 - Project Work/RedditDataWithLinks_Master.csv"
posts_tidy_df = pd.read_csv(filepath_or_buffer=path)

# Lift the column cap on the display, then show the frame as the cell output.
pd.options.display.max_columns = None
posts_tidy_df


#### Cleaning data

In [None]:
## version 3 - with scaling
print(len(posts_tidy_df))

# Drop rows where Ups or Downs is equal to zero, and rows where Ups is missing.
# A boolean mask selects exactly the offending rows whatever the index looks
# like; .copy() gives an independent frame for the column assignments below.
has_zero_votes = (posts_tidy_df['Ups'] == 0) | (posts_tidy_df['Downs'] == 0)
posts_tidy_df = posts_tidy_df[~has_zero_votes].dropna(subset=['Ups']).copy()

# Calculate UpvoteRatio (overwrites any existing column of that name)
posts_tidy_df['UpvoteRatio'] = posts_tidy_df['Ups'] / posts_tidy_df['Downs']

# Calculate the maximum and minimum values of UpvoteRatio
max_ratio = posts_tidy_df['UpvoteRatio'].max()
min_ratio = posts_tidy_df['UpvoteRatio'].min()

# Apply min-max scaling to the UpvoteRatio.
# When every ratio is identical the range is 0 and the division would be 0/0;
# in that case keep the centred values (all zeros, NaN rows stay NaN).
centred_ratio = posts_tidy_df['UpvoteRatio'] - min_ratio
ratio_range = max_ratio - min_ratio
if ratio_range == 0:
    posts_tidy_df['ScaledUpvoteRatio'] = centred_ratio
else:
    posts_tidy_df['ScaledUpvoteRatio'] = centred_ratio / ratio_range

# Applied scale: the scaled ratio weighted by the raw ratio
posts_tidy_df['AppliedScale'] = posts_tidy_df['ScaledUpvoteRatio'] * posts_tidy_df['UpvoteRatio']

# Save the modified DataFrame to a CSV file (all columns, no index column)
filename = "C:/Users/Chris/Desktop/DTU/4. Semester/02466 - Project Work/RedditDataWithLinks4.csv"
posts_tidy_df.to_csv(filename, header=True, index=False)



#### Save data frame as CSV


In [None]:
# Destination of the cleaned data set.
filename = "C:/Users/Chris/Desktop/DTU/4. Semester/02466 - Project Work/RedditDataWithLinks4.csv"
# Write every column with a header row and without the index column.
all_columns = posts_tidy_df.columns.tolist()
posts_tidy_df.to_csv(filename, columns=all_columns, index=False, header=True)

## Downloading the data

#### Fetching data frame from file

In [None]:
# Read the cleaned CSV back in as the frame used by the download step.
path = "C:/Users/Chris/Desktop/DTU/4. Semester/02466 - Project Work/RedditDataWithLinks4.csv"
posts_tidy_df = pd.read_csv(filepath_or_buffer=path)


In [None]:
# Remove the column cap on the display and show the frame as the cell output.
pd.options.display.max_columns = None
posts_tidy_df



In [None]:
# Only look at posts that have a usable image URL.
filtered_df = posts_tidy_df.dropna(subset=['NewURL'])

# Min-max scaling maps the smallest ratio to exactly 0, and log(0) is -inf;
# plt.hist cannot bin non-finite values. Keep only strictly positive, finite
# values before taking the logarithm.
scaled_ratio = filtered_df['ScaledUpvoteRatio']
plottable = scaled_ratio[(scaled_ratio > 0) & np.isfinite(scaled_ratio)]
plt.hist(np.log(plottable), bins=100)

# The plotted quantity is the natural log of the scaled ratio.
plt.xlabel('log(ScaledUpvoteRatio)')
plt.ylabel('Frequency')
plt.title('Distribution of ScaledUpvoteRatio')

plt.show()

#### Save images from URLs

In [None]:
#Save images from data frame URL column
root_folder = "C:/Users/sebas/OneDrive/Dokumenter/skole/4 Semester/Fagprojekt/Images/"


def _pick_url(row):
    """Return the URL to fetch for *row*, or None when there is nothing usable.

    'NewURL' is preferred; 'ImageURL' is only accepted as a fallback when it
    points at https://i.redd.it/.
    """
    url = row['NewURL']
    if not pd.isna(url):
        return url
    url = row['ImageURL']
    if pd.isna(url) or not url.startswith('https://i.redd.it/'):
        return None
    return url


def _with_scheme(url):
    """Return *url* with an explicit scheme, defaulting to http.

    Handles protocol-relative ('//host/..'), over-slashed ('////host/..') and
    bare ('host/..') forms.
    """
    if url.startswith(('http://', 'https://')):
        return url
    return 'http://' + url.lstrip('/')


def download(row):
    """Download the image for one data-frame row and store it as a 224x224 JPEG.

    row -- a row of the posts frame; reads 'PostID', 'NewURL' and 'ImageURL',
           and uses row.name in the progress messages.

    The file is written to root_folder + PostID + '.jpg'. Rows without a usable
    URL and responses smaller than 10 kB are skipped. Failures are reported
    with print and never raised, so one bad row does not stop a whole run.
    Always returns None.
    """
    filename = root_folder + row['PostID'] + '.jpg'

    try:
        url = _pick_url(row)
        if url is None:
            print(f"Skipping row {row.name} - Missing URL")
            return
        url = _with_scheme(url)

        print(f"Downloading row {row.name} {url} to {filename}")
        # Without a timeout a stalled server would block the run indefinitely.
        r = requests.get(url, allow_redirects=True, timeout=30)
        r.raise_for_status()

        if len(r.content) < 10240:  # 10kB = 10240 bytes
            print(f"Skipping row {row.name} - Image size is less than 10kB")
            return

        # Decode and resize from memory. Re-opening the target file while it is
        # still open for writing can read unflushed data and corrupt the result.
        with Image.open(io.BytesIO(r.content)) as img:
            # JPEG cannot store alpha/palette modes, so normalise to RGB first.
            resized = img.convert('RGB').resize((224, 224))

        # create folder if it doesn't exist
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        resized.save(filename)

    except Exception as e:
        # Deliberately broad: this is a best-effort, per-row operation.
        print(f"Error occurred while processing row {row.name}: {e}")
        

# Run the downloader once per row. iloc[0:] selects the whole frame; a larger
# start position would process only the rows from that position onwards.
posts_tidy_df.iloc[0:].apply(download, axis='columns')