# Data Pre-processing : Reddit r/developersindia Community Posts Raw

**Problem Statement:** Prepare and clean data for analysis and modeling.

**Dataset:** https://www.kaggle.com/datasets/avgeekgupta/reddit-rdevelopersindia-community-posts-raw/

**Description:** This file contains the steps performed to remove null values and duplicates, and the column manipulations used to prepare the data.

In [None]:
# Importing libraries

import pandas
import datetime

In [None]:
# Loading the data: read ./data.csv (relative to the working directory)
# into the DataFrame `data`, which every later cell modifies in place

data = pandas.read_csv("./data.csv")

# Preview the first 5 rows
data.head(5)

In [None]:
# Looking at the columns: print the DataFrame summary;
# verbose=True requests the full per-column listing

data.info(verbose=True)

In [None]:
# Deleting columns in which every value is null
null_counts = data.isnull().sum()

# Compare against the actual number of rows rather than a hard-coded row
# count, so the cell still works when the dataset grows, shrinks or is
# filtered before this step.
total_rows = len(data)

for key, value in null_counts.items():
    if value == total_rows:
        print("Removed " + key)
        data.drop(key, inplace=True, axis=1)

print("\n\nColumns left: " + str(len(data.columns)))

In [None]:
# Drop constant columns, i.e. columns holding exactly one distinct value
# (unique() keeps NaN as a value, so NaN counts towards the total)

constant_columns = [name for name in data.columns if len(data[name].unique()) == 1]

for name in constant_columns:
    print("Removed " + name)
    data.drop(columns=name, inplace=True)

print("\n\nColumns left: " + str(len(data.columns)))

In [None]:
# Drop every column whose name mentions "color" or "template"

for name in data.columns:
    # Guard clause: keep columns that match neither keyword
    if "color" not in name and "template" not in name:
        continue
    print("Removed " + name)
    data.drop(columns=name, inplace=True)

print("\n\nColumns left: " + str(len(data.columns)))

In [None]:
# Drop every column whose name contains one of the media-related keywords

media_keywords = ("thumbnail", "preview", "media", "video", "gallery")

for name in data.columns:
    if any(keyword in name for keyword in media_keywords):
        print("Removed " + name)
        data.drop(columns=name, inplace=True)

print("\n\nColumns left: " + str(len(data.columns)))

In [None]:
# Drop every column whose name mentions "html" or "richtext"

markup_columns = [name for name in data.columns if "html" in name or "richtext" in name]

# Report each match, then remove them all in a single drop
for name in markup_columns:
    print("Removed " + name)
data.drop(columns=markup_columns, inplace=True)

print("\n\nColumns left: " + str(len(data.columns)))

In [None]:
# Let's see what columns are left to work with

data.columns

In [None]:
# Removing unnecessary columns

columns_to_remove = ['name', 'hide_score', 'is_original_content', 'author_premium', 'post_hint',
                     'link_flair_type', 'author_flair_type', 'domain', 'over_18', 'spoiler',
                     'distinguished', 'id', 'author_patreon_flair', 'permalink', 'url',
                     'subreddit_subscribers', 'created_utc', 'url_overridden_by_dest', 'score',
                     'author_cakeday', 'send_replies', 'stickied', 'edited', 'num_crossposts',
                     'no_follow', 'locked', 'author_fullname'
                    ]

# Drop only the columns that are still present. A listed column may already
# be gone (removed by an earlier cell, or because this cell is being re-run),
# and data.drop() would raise KeyError on the first missing name.
for column in columns_to_remove:
    if column not in data.columns:
        # Report the skip so a misspelled name in the list stays visible
        print("Skipped " + column + " (not present)")
        continue
    print("Removed " + column)
    data.drop(column, inplace=True, axis=1)

print("\n\nColumns left: " + str(len(data.columns)))

In [None]:
# Let's see what columns are left to work with

data.columns

In [None]:
# Turn the POSIX timestamps in 'created' into datetime values
# NOTE(review): fromtimestamp() without a tz argument returns naive local
# time of the machine running this cell - confirm that is intended

to_datetime = datetime.datetime.fromtimestamp
data['created'] = data['created'].apply(to_datetime)

In [None]:
## Check whether the conversion was successful

data['created']

In [None]:
# Split the 'created' datetime into separate calendar/clock columns
created_parts = data['created'].dt

# Numeric components: month, day of month, hour and minute
for part in ('month', 'day', 'hour', 'minute'):
    data[part] = getattr(created_parts, part)

# Weekday as its name rather than a number
data['weekday'] = created_parts.day_name()

In [None]:
# Let's have a look at the first 5 rows of the dataset after adding
# the month, day, hour, minute and weekday columns
data.head(5)

In [None]:
# Estimating downvotes for posts.
# upvote_ratio is the share of upvotes among all votes, i.e.
# upvote_ratio = ups / (ups + downs). Hence ups / upvote_ratio is the TOTAL
# number of votes, and the downvotes are that total minus the upvotes.
ratio = data['upvote_ratio']

# Mask non-positive ratios with NaN so a ratio of 0 does not divide by zero;
# the downvote count cannot be recovered for such rows.
total_votes = (data['ups'] / ratio.where(ratio > 0)).round()

data['downs'] = total_votes - data['ups']

In [None]:
# Let's have a look at the first 5 rows of the dataset after adding the 'downs' column
data.head(5)

In [None]:
# Save the processed DataFrame to 'processed_data.csv' in the working
# directory; index=False leaves the row index out of the file
data.to_csv('processed_data.csv', index=False)