In [None]:
# !pip install spacy
# !python -m spacy download en_core_web_sm

In [None]:
# nltk.download('punkt')

In [None]:
import json
import re
import spacy
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# Load data from JSON file

with open('r_anxietyDepression.json', 'r') as f:
    data = json.load(f)

In [4]:
# Initialize lists to store post titles, comments, and authors

post_titles = []
comments = []
authors = []

In [5]:
# Extract relevant information from JSON data

for post in data:
    post_titles.append(post['title'])
    for comment in post['comments']:
        if comment['body'] != "[deleted]":  # Ignore deleted comments
            comments.append(comment['body'])
            authors.append(comment['author'] if comment['author'] else "Unknown")

In [6]:
# Function to clean text data

def clean_text(text):
    text = re.sub(r'\[.*?\]', '', text)                # Remove square brackets and everything within them
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub(r'\n', ' ', text)                    # Remove newline characters
    text = re.sub(r'[^A-Za-z0-9]+', ' ', text)         # Remove special characters except alphanumericand emojis
    text = text.lower()                                # Convert text to lowercase
        
    # Emoticons and emojis can convey sentiment. Replace them with textual representations.
    # Example: 😊 -> "smile"
    # Depending on the emoticons and emojis you want to handle, you can add more replacements here.
    # text = re.sub(r':\)', ' smile ', text)  # Example replacement for :) to "smile"
    # text = re.sub(r':\(', ' frown ', text)  # Example replacement for :( to "frown"
    
    return text

In [7]:
comments_cleaned = [clean_text(comment) for comment in comments]

In [8]:
comments_cleaned

['nice to see people not getting stopped by their mental state and just pursuing what makes then happy anyways congrats ',
 'glowing congrats and you look great ',
 'nawww congratulations you look absolutely stunning ',
 'you look beautiful happy and confident congratulations on your marriage ',
 'i am love with your look classy elegant ',
 'i absolutely love your look it s retro and classy but you add some uniqueness with your shoes you look happy and like you know yourself and are not afraid to be your own person congratulations ',
 'yes yes you were ',
 'congratulations i love your shoes and dress but i especially love your attitude hope you had an amazing day ',
 'you look amazing i wish you and your husband or wife the best of luck',
 'you look great and i love your shoes omg do you have more photos i d love to see ',
 'congratulations you look stunning btw',
 'wow you look so beautiful your dress is perfect ',
 'you sure were congratulations to you ',
 'wow you look absolutely go

#### Stemming

In [9]:
# Initialize the PorterStemmer
stemmer = PorterStemmer()

# Tokenize and stem the text data
def stem_text(text):
    tokens = word_tokenize(text)
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return ' '.join(stemmed_tokens)


# Apply stemming to comments
comments_stemmed = [stem_text(comment) for comment in comments_cleaned]

In [10]:
comments_stemmed

['nice to see peopl not get stop by their mental state and just pursu what make then happi anyway congrat',
 'glow congrat and you look great',
 'nawww congratul you look absolut stun',
 'you look beauti happi and confid congratul on your marriag',
 'i am love with your look classi eleg',
 'i absolut love your look it s retro and classi but you add some uniqu with your shoe you look happi and like you know yourself and are not afraid to be your own person congratul',
 'ye ye you were',
 'congratul i love your shoe and dress but i especi love your attitud hope you had an amaz day',
 'you look amaz i wish you and your husband or wife the best of luck',
 'you look great and i love your shoe omg do you have more photo i d love to see',
 'congratul you look stun btw',
 'wow you look so beauti your dress is perfect',
 'you sure were congratul to you',
 'wow you look absolut gorgeou whoever wa meet you at the altar is certainli lucki to have you in their life i wish onli the best of time for 

#### Lemmatization

In [14]:
# Load spaCy's English tokenizer for lemmatization
nlp = spacy.load("en_core_web_sm")

# Lemmatize the text data
def lemmatize_text(text):
    doc = nlp(text)
    lemmatized_tokens = [token.lemma_ for token in doc]
    return ' '.join(lemmatized_tokens)


# Apply lemmatization to comments
comments_lemmatized = [lemmatize_text(comment) for comment in comments_cleaned]

In [15]:
comments_lemmatized

['nice to see people not getting stop by their mental state and just pursue what make then happy anyways congrat',
 'glow congrat and you look great',
 'nawww congratulation you look absolutely stunning',
 'you look beautiful happy and confident congratulation on your marriage',
 'I be love with your look classy elegant',
 'I absolutely love your look it s retro and classy but you add some uniqueness with your shoe you look happy and like you know yourself and be not afraid to be your own person congratulation',
 'yes yes you be',
 'congratulation I love your shoe and dress but I especially love your attitude hope you have an amazing day',
 'you look amazing I wish you and your husband or wife the good of luck',
 'you look great and I love your shoe omg do you have more photo I d love to see',
 'congratulation you look stunning btw',
 'wow you look so beautiful your dress be perfect',
 'you sure be congratulation to you',
 'wow you look absolutely gorgeous whoever be meet you at the al

In [16]:
# Vectorize the text data

vectorizer = TfidfVectorizer(max_features=5000)  # Adjust max_features as needed

X = vectorizer.fit_transform(comments_lemmatized)

In [18]:
# Print the vectorized output
print(X.toarray())

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.23167937 0.         0.        ]
 [0.45505406 0.         0.         ... 0.21834693 0.         0.        ]
 ...
 [0.         0.         0.         ... 0.1860261  0.         0.        ]
 [0.         0.         0.         ... 0.         0.24569473 0.        ]
 [0.         0.49712949 0.         ... 0.         0.         0.        ]]
