In [1]:
import pandas as pd

# Location of the Mental-Health-Twitter CSV inside the Kaggle input directory
DATA_PATH = "/kaggle/input/mental-health-social-media/Mental-Health-Twitter.csv"

# Read the CSV, using its first column as the row index
df = pd.read_csv(DATA_PATH, index_col=[0])

# Preview the first rows of the loaded frame
df.head()

Unnamed: 0,post_id,post_created,post_text,user_id,followers,friends,favourites,statuses,retweets,label
0,637894677824413696,Sun Aug 30 07:48:37 +0000 2015,It's just over 2 years since I was diagnosed w...,1013187241,84,211,251,837,0,1
1,637890384576778240,Sun Aug 30 07:31:33 +0000 2015,"It's Sunday, I need a break, so I'm planning t...",1013187241,84,211,251,837,1,1
2,637749345908051968,Sat Aug 29 22:11:07 +0000 2015,Awake but tired. I need to sleep but my brain ...,1013187241,84,211,251,837,0,1
3,637696421077123073,Sat Aug 29 18:40:49 +0000 2015,RT @SewHQ: #Retro bears make perfect gifts and...,1013187241,84,211,251,837,2,1
4,637696327485366272,Sat Aug 29 18:40:26 +0000 2015,It’s hard to say whether packing lists are mak...,1013187241,84,211,251,837,1,1


In [2]:
# Display dataset info: index range, per-column non-null counts and dtypes, and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20000 entries, 0 to 19999
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   post_id       20000 non-null  int64 
 1   post_created  20000 non-null  object
 2   post_text     20000 non-null  object
 3   user_id       20000 non-null  int64 
 4   followers     20000 non-null  int64 
 5   friends       20000 non-null  int64 
 6   favourites    20000 non-null  int64 
 7   statuses      20000 non-null  int64 
 8   retweets      20000 non-null  int64 
 9   label         20000 non-null  int64 
dtypes: int64(8), object(2)
memory usage: 1.7+ MB


In [3]:
# Display summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# NOTE(review): post_id and user_id look like identifiers, so their mean/std are
# presumably not meaningful — confirm before interpreting those two columns.
df.describe()

Unnamed: 0,post_id,user_id,followers,friends,favourites,statuses,retweets,label
count,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0,20000.0
mean,6.874728e+17,3.548623e+16,900.48395,782.42875,6398.23555,44394.42,1437.9273,0.5
std,1.708396e+17,1.606083e+17,1899.913961,1834.817945,8393.072914,140778.5,15119.665118,0.500013
min,3555966000.0,14724380.0,0.0,0.0,0.0,3.0,0.0,0.0
25%,5.931686e+17,324294400.0,177.0,211.0,243.0,5129.0,0.0,0.0
50%,7.6374e+17,1052122000.0,476.0,561.0,2752.0,13251.0,0.0,0.5
75%,8.153124e+17,2285923000.0,1197.0,701.0,8229.0,52892.0,1.0,1.0
max,8.194574e+17,7.631825e+17,28614.0,28514.0,39008.0,1063601.0,839540.0,1.0


In [4]:
# Check for missing values: count the null entries in each column
df.isnull().sum()

post_id         0
post_created    0
post_text       0
user_id         0
followers       0
friends         0
favourites      0
statuses        0
retweets        0
label           0
dtype: int64

In [5]:
import re
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK stop-word corpus that stopwords.words() reads in clean_text below
nltk.download('stopwords')

# Function to clean the text
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop-word list as a frozenset, loading it only once."""
    if 'english' not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORDS_CACHE['english']


def clean_text(text):
    """Normalise one post for text-feature extraction.

    Steps, in order: strip URLs, @mentions, #hashtags (the tag word is removed
    together with the '#'), digits and punctuation; lowercase; drop English
    stop words.

    Args:
        text: The raw post text (a ``str``).

    Returns:
        The remaining lowercase words joined by single spaces; an empty string
        if nothing is left.
    """
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'#\w+', '', text)  # Remove hashtags
    text = re.sub(r'\d+', '', text)  # Remove numbers
    # Punctuation is stripped before the stop-word filter, so a contraction such
    # as "I'm" reaches the filter as "im".
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = text.lower()  # Convert to lowercase

    # Look the stop words up in a cached set: calling stopwords.words() inside the
    # comprehension rebuilt the whole list and scanned it linearly for every word.
    stop_words = _english_stopwords()
    return ' '.join(word for word in text.split() if word not in stop_words)  # Remove stopwords

# Handle missing values first: clean_text() passes its argument to re.sub(),
# which raises TypeError on NaN, so rows with missing data have to be dropped
# before cleaning rather than after.
df = df.dropna()

# Apply the clean_text function to the post_text column. assign() returns a new
# frame with cleaned_text appended as the last column, so no column is set on
# the (possibly filtered) result of dropna().
df = df.assign(cleaned_text=df['post_text'].apply(clean_text))

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Preview the first rows again to inspect the new cleaned_text column
df.head()

Unnamed: 0,post_id,post_created,post_text,user_id,followers,friends,favourites,statuses,retweets,label,cleaned_text
0,637894677824413696,Sun Aug 30 07:48:37 +0000 2015,It's just over 2 years since I was diagnosed w...,1013187241,84,211,251,837,0,1,years since diagnosed today im taking moment r...
1,637890384576778240,Sun Aug 30 07:31:33 +0000 2015,"It's Sunday, I need a break, so I'm planning t...",1013187241,84,211,251,837,1,1,sunday need break im planning spend little tim...
2,637749345908051968,Sat Aug 29 22:11:07 +0000 2015,Awake but tired. I need to sleep but my brain ...,1013187241,84,211,251,837,0,1,awake tired need sleep brain ideas
3,637696421077123073,Sat Aug 29 18:40:49 +0000 2015,RT @SewHQ: #Retro bears make perfect gifts and...,1013187241,84,211,251,837,2,1,rt bears make perfect gifts great beginners ge...
4,637696327485366272,Sat Aug 29 18:40:26 +0000 2015,It’s hard to say whether packing lists are mak...,1013187241,84,211,251,837,1,1,hard say whether packing lists making life eas...


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Text-based features: TF-IDF weights over the cleaned posts, vocabulary capped at 500 terms
vectorizer = TfidfVectorizer(max_features=500)
text_features = vectorizer.fit_transform(df['cleaned_text']).toarray()  # dense array for stacking

# User-based features: the numeric count columns, taken as-is.
# NOTE(review): these counts are unscaled and much larger than the TF-IDF weights —
# confirm the downstream model is insensitive to feature scale.
user_columns = ['followers', 'friends', 'favourites', 'statuses', 'retweets']
user_features = df[user_columns].to_numpy()

# Combine text and user features side by side (one row per post)
features = np.concatenate([text_features, user_features], axis=1)

# Display the shape of the extracted features
print(features.shape)

(20000, 505)


In [8]:
# Show the combined feature matrix (NumPy abbreviates the repr of large arrays)
features

array([[0.000000e+00, 0.000000e+00, 0.000000e+00, ..., 2.510000e+02,
        8.370000e+02, 0.000000e+00],
       [0.000000e+00, 0.000000e+00, 0.000000e+00, ..., 2.510000e+02,
        8.370000e+02, 1.000000e+00],
       [0.000000e+00, 0.000000e+00, 0.000000e+00, ..., 2.510000e+02,
        8.370000e+02, 0.000000e+00],
       ...,
       [0.000000e+00, 0.000000e+00, 0.000000e+00, ..., 7.000000e+00,
        1.063601e+06, 0.000000e+00],
       [0.000000e+00, 0.000000e+00, 0.000000e+00, ..., 7.000000e+00,
        1.063601e+06, 0.000000e+00],
       [0.000000e+00, 0.000000e+00, 0.000000e+00, ..., 7.000000e+00,
        1.063601e+06, 0.000000e+00]])