In [13]:
from pprint import pprint
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [14]:
# loading the scraped dataset
df = pd.read_csv("../data/reddit_tech_raw_data.csv")
df.head()

Unnamed: 0,type,post_id,title,timestamp,text,score,total_comments,post_url
0,Post,1njvxsb,"Sinclair Says Kimmel Suspension is Not Enough,...",1758161000.0,,15092,1521,https://sbgi.net/sinclair-says-kimmel-suspensi...
1,comment,1njvxsb,,1758163000.0,"""Kimmel to make a meaningful personal donation...",776,0,
2,comment,1njvxsb,,1758162000.0,This company never addressed [that they had a ...,6337,0,
3,comment,1njvxsb,,1758162000.0,And this is why kneeling to totalitarians to s...,1059,0,
4,comment,1njvxsb,,1758161000.0,>Sinclair will not lift the suspension of “Ji...,12776,0,


In [15]:
df.shape

(14053, 8)

In [None]:
# creating a copy of dataset
df_copy = df.copy()

In [17]:
df_copy.shape

(14053, 8)

In [18]:
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14053 entries, 0 to 14052
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   type            14053 non-null  object 
 1   post_id         14053 non-null  object 
 2   title           100 non-null    object 
 3   timestamp       14053 non-null  float64
 4   text            13953 non-null  object 
 5   score           14053 non-null  int64  
 6   total_comments  14053 non-null  int64  
 7   post_url        100 non-null    object 
dtypes: float64(1), int64(2), object(5)
memory usage: 878.4+ KB


In [22]:
# fixing the timestamp format
df_copy['timestamp'] = pd.to_datetime(df_copy['timestamp'], unit = 's')
df_copy.head()

Unnamed: 0,type,post_id,title,timestamp,text,score,total_comments,post_url
0,Post,1njvxsb,"Sinclair Says Kimmel Suspension is Not Enough,...",2025-09-18 01:59:34,,15092,1521,https://sbgi.net/sinclair-says-kimmel-suspensi...
1,comment,1njvxsb,,2025-09-18 02:30:59,"""Kimmel to make a meaningful personal donation...",776,0,
2,comment,1njvxsb,,2025-09-18 02:19:32,This company never addressed [that they had a ...,6337,0,
3,comment,1njvxsb,,2025-09-18 02:13:16,And this is why kneeling to totalitarians to s...,1059,0,
4,comment,1njvxsb,,2025-09-18 02:02:23,>Sinclair will not lift the suspension of “Ji...,12776,0,


In [24]:
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14053 entries, 0 to 14052
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   type            14053 non-null  object        
 1   post_id         14053 non-null  object        
 2   title           100 non-null    object        
 3   timestamp       14053 non-null  datetime64[ns]
 4   text            13953 non-null  object        
 5   score           14053 non-null  int64         
 6   total_comments  14053 non-null  int64         
 7   post_url        100 non-null    object        
dtypes: datetime64[ns](1), int64(2), object(5)
memory usage: 878.4+ KB


In [23]:
# checking for null values
missing_count = df_copy.isnull().sum().sort_values(ascending = False)
missing_count

title             13953
post_url          13953
text                100
type                  0
timestamp             0
post_id               0
score                 0
total_comments        0
dtype: int64

#### About Null Values
- `title`and `post_url`columns are only relevant to `posts` since the majority of rows are comments(over 13000) these columns are empty
- `text` column is mainly for the `comments` like body of the comments. Posts may have text if they are self-posts but most of the posts contain a title and a link so its showing 100 null values

**Action:**
- Filling `NaN` values in `text` and `title` columns with empty strings

In [31]:
# filling NaN values with the empty strings
df_copy['title'] = df_copy['title'].fillna(" ")
df_copy['text'] = df_copy['text'].fillna(" ")
df_copy['post_url'] = df_copy['post_url'].fillna(" ")

In [32]:
# for sentiment analysis creating a new column
df_copy['text_to_analyze'] = np.where(df_copy['type'] == 'post',
                                      df_copy['title'] + " " + df_copy['text'],
                                      df_copy['text'])

df_copy[['type', 'title', 'text', 'text_to_analyze']].head()

Unnamed: 0,type,title,text,text_to_analyze
0,Post,"Sinclair Says Kimmel Suspension is Not Enough,...",,
1,comment,,"""Kimmel to make a meaningful personal donation...","""Kimmel to make a meaningful personal donation..."
2,comment,,This company never addressed [that they had a ...,This company never addressed [that they had a ...
3,comment,,And this is why kneeling to totalitarians to s...,And this is why kneeling to totalitarians to s...
4,comment,,>Sinclair will not lift the suspension of “Ji...,>Sinclair will not lift the suspension of “Ji...


In [34]:
# looking for duplicates
print(f"Number of Duplicate rows: {df_copy.duplicated().sum()}")

Number of Duplicate rows: 0


In [35]:
# checking index values are unique or not
df_copy.index.is_unique

True

In [None]:
# checking for unique values in categorical columns
print(f"Unique Values in 'type' column: {df_copy['type'].unique()}") # expected 2 unique values 'post' and 'column'
print(f"Unique Values in 'post_id' column: {df_copy['post_id'].unique()}") # scraped 100 posts and output displayed 100 unique values

Unique Values in 'type' column: ['Post' 'comment']
Unique Values in 'post_id' column: ['1njvxsb' '1njsge7' '1nk4y7l' '1njzhsz' '1nk6liz' '1njnb5k' '1nk6kf0'
 '1nk71zy' '1nk162x' '1nk6a1k' '1njudys' '1njy1f0' '1nk5b5r' '1njl1e8'
 '1njcwto' '1nk62zw' '1njpx4u' '1njlcqn' '1njnvtb' '1nk71vk' '1njwroc'
 '1nk9n91' '1nj5c7p' '1njg74h' '1njpn8e' '1nj8nga' '1njt634' '1nk645k'
 '1nk7ti9' '1njehcn' '1nj60wb' '1nk6z7g' '1njsf93' '1nk6rud' '1njxzyh'
 '1njfn7m' '1nje623' '1njjfna' '1njb0z8' '1nk7mgk' '1nk4uk7' '1nj91mo'
 '1nj9t1k' '1njfl5h' '1njt7oc' '1nj36t9' '1nimpmu' '1nka3oi' '1njr8gm'
 '1niq8kw' '1nk0k09' '1nk33su' '1nitgbv' '1nj837o' '1nk4yng' '1nk6ku5'
 '1nk3kx5' '1njpfyc' '1nk98wu' '1nj2ijw' '1niravi' '1njzfnf' '1njsh5r'
 '1njipz3' '1njlvuq' '1nijhk7' '1niy6x5' '1nkawoq' '1njd1ud' '1njk8k6'
 '1njz2pa' '1nk9lmz' '1njafr7' '1njn6zy' '1ninh61' '1njmgw1' '1njs7p2'
 '1njohc9' '1nit43o' '1nk6250' '1njhq6y' '1nk5p2e' '1nk5orj' '1njzmd8'
 '1njgm4m' '1nk8uub' '1njhtqq' '1nic3o4' '1njv2rr' '1njefac' '

### Text Preprocessing Step
- converting all the text to lowercase on the `text_to_analyze` column
- removing any urls, emojis, punctuation and special characters
- removing any stop words that appear repeatedly but doesn't add any value to the sentence
- splitting the body of words to tokens and grouping the similar words and reduce the size of the vocabulary by stemming process

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('stopwords')
nltk.download('punkt')

In [48]:
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

# creating a function to clean the text_to_analyze column
def clean_text(text):
  text = text.lower()   # converting all the text to lower case
  text = re.sub(r'http\S+|www\S+|@\w+|#\w+', '', text)  # removes url, mentions and hashtags
  text = re.sub(r'[^a-z\s]', '', text)  # removes all the punctuation and number only keeps the letters and spaces
  text = re.sub(r'\s+', ' ', text).strip()   # removes the extra whitespaces

  tokens = text.split()   # this breaks text to tokens
  filtered_stemmed_tokens = [
    stemmer.stem(word) for word in tokens if word not in stop_words
  ]
  cleaned_text = ' '.join(filtered_stemmed_tokens)    # this will join the filtered tokens back to single string

  return cleaned_text

df_copy['cleaned_text'] = df_copy['text_to_analyze'].apply(clean_text)

print(df_copy[['text_to_analyze', 'cleaned_text']].head(10))

                                     text_to_analyze  \
0                                                      
1  "Kimmel to make a meaningful personal donation...   
2  This company never addressed [that they had a ...   
3  And this is why kneeling to totalitarians to s...   
4   >Sinclair will not lift the suspension of “Ji...   
5  It. Will. Never. Be. Enough. For. These. Peopl...   
6                Well, Sinclair is a far-right turd.   
7  Note that Jimmy didn't say anything disparagin...   
8  Jimmy Kimmel should walk away; sue Disney for ...   
9  Welp, I will never watch ABC again regardless ...   

                                        cleaned_text  
0                                                     
1  kimmel make meaning person donat kirk famili t...  
2  compani never address chairman abus virginia g...  
3  kneel totalitarian save skin stupid simpli mov...  
4  sinclair lift suspens jimmi kimmel live statio...  
5  never enough peopl appeas authoritarian polit ... 

In [49]:
df_copy.to_csv('../data/reddit_tech_cleaned_data.csv', index = False)

#### moving to next step i.e., EDA on SQL(MySQL)