In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
pip install emoji google_trans_new thulac gensim

In [None]:
import nltk
import pandas as pd
import csv
import re
import emoji
import pandas as pd
import numpy as np

control_file = 'control_comments_male.txt'
depressed_file = 'depressed_comments_male.txt'


def _read_lines(path):
    """Return the lines of the UTF-8 text file at ``path`` with newline characters removed."""
    with open(path, "r", encoding="utf-8") as handle:
        return [line.replace("\n", "") for line in handle]


d_lines = _read_lines(depressed_file)
c_lines = _read_lines(control_file)

# One row per line of each file; the "Depressed" column is the class label
# (1 = depressed file, 0 = control file).
depressed = pd.DataFrame(d_lines, columns=["Tweet"])
depressed["Depressed"] = 1
control = pd.DataFrame(c_lines, columns=["Tweet"])
control["Depressed"] = 0

df = pd.concat([depressed, control], ignore_index=True)
# Drop empty lines and rows whose whole text is '转发微博'
# (literally "repost Weibo" — presumably the auto-generated repost text; confirm).
df = df[~df['Tweet'].isin(['', '转发微博'])]
# Shuffle with a fixed seed so a fresh "Restart & Run All" reproduces the same
# row order, then renumber the index from 0.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df.head()



In [None]:
# Report the frame's shape, then the number of non-null values per column for each label.
dataset_shape = df.shape
print("Dataset size:", dataset_shape)
df.groupby("Depressed").count()

In [None]:
from google_trans_new import google_translator as Translator
from joblib import Parallel, delayed
import thulac #https://github.com/thunlp/THULAC-Python

# Helper objects built once and reused by later cells.
# NOTE(review): `translator` is not referenced in the visible cells — confirm it is still needed.
translator = Translator()
# seg_only=True — presumably word segmentation without part-of-speech tagging; confirm in the THULAC-Python README.
chinese_tokenizer = thulac.thulac(seg_only=True)

def remove_noice(tweet):
  """Keep only the characters of ``tweet`` that lie in the range U+4E00-U+9FFF.

  Args:
    tweet: The text to clean.

  Returns:
    A string made of the characters of ``tweet`` inside U+4E00-U+9FFF, in their
    original order; everything else (Latin letters, digits, punctuation,
    whitespace, emoji) is removed.
  """
  trimmed = tweet.strip()
  # The negated character class also matches whitespace, so the final replace
  # has nothing left to remove; it is kept to mirror the original steps exactly.
  hanzi_only = re.sub(r"[^\u4e00-\u9fff]", "", trimmed)
  return hanzi_only.replace(" ", "")

def replace_emoji(tweet):
  """Replace each emoji in ``tweet`` with the ``'zh'`` name from the emoji package's data.

  Args:
    tweet: Text that may contain emoji.

  Returns:
    ``tweet`` with every distinct emoji substituted by its ``'zh'`` entry, or by
    an empty string when the package offers no such entry for that emoji.
  """
  # NOTE(review): recent emoji releases appear to load translations lazily through
  # emoji.config.load_language — confirm against the installed version. The getattr
  # guards keep releases without that hook working unchanged.
  load_language = getattr(getattr(emoji, "config", None), "load_language", None)
  if callable(load_language):
    load_language('zh')

  def translate_emoji(emoji_icon):
    """Return the 'zh' name of ``emoji_icon``, or "" when it cannot be resolved."""
    # Give next() a default so an empty iterator yields the "" fallback instead of
    # raising StopIteration (the original truthiness check could never trigger).
    first_token = next(emoji.analyze(emoji_icon), None)
    if first_token is None:
      return ""
    data = getattr(first_token.value, "data", None) or {}
    # .get avoids a KeyError for emoji that have no 'zh' entry.
    return data.get('zh', "")

  # Replace longer emoji first: a shorter emoji can be a substring of a longer
  # sequence, and replacing it first would break the longer match.
  for emoji_icon in sorted(emoji.distinct_emoji_list(tweet), key=len, reverse=True):
    tweet = tweet.replace(emoji_icon, translate_emoji(emoji_icon))
  return tweet




# Derive two columns: emoji replaced by their names first, then everything
# outside U+4E00-U+9FFF stripped from that result.
df["emojilessTweet"] = df["Tweet"].apply(replace_emoji)
df["noiselessTweet"] = df["emojilessTweet"].apply(remove_noice)



In [None]:
# Show the first 50 rows to compare the raw, emoji-replaced and cleaned text columns.
df.head(50)

In [None]:
# The dataset is large, so a lexicon is used to keep only the rows that are highly relevant.

lexicon_df = pd.read_csv("depressionLexiconNew.csv", encoding="utf-8")
# Preview the lexicon entries whose depression_point is 1.
is_depressed_entry = lexicon_df["depression_point"] == 1
print(lexicon_df.loc[is_depressed_entry, "context"].tolist())

In [None]:
depressed_lexicon_keywords = lexicon_df.loc[lexicon_df["depression_point"] == 1, "context"].tolist()
# Build one alternation from the keywords. Each keyword is escaped so it is matched
# literally (assumes the "context" column holds plain words, not regexes — confirm);
# blank or non-string cells are skipped because they would either break the join or
# make the pattern match every tweet.
pattern = re.compile('|'.join(
    re.escape(keyword)
    for keyword in depressed_lexicon_keywords
    if isinstance(keyword, str) and keyword
))

# Keep only the depressed-labelled rows whose cleaned text contains a keyword.
filtered_depressed_df = df[df['Depressed'] == 1]
has_depressed_keyword = filtered_depressed_df["noiselessTweet"].astype(str).apply(
    lambda x: pattern.search(x) is not None
)
filtered_depressed_df = filtered_depressed_df[has_depressed_keyword]
filtered_depressed_df.head(50)

In [None]:
healthy_lexicon_keywords = lexicon_df.loc[lexicon_df["depression_point"] == 0, "context"].tolist()
# Build one alternation from the keywords. Each keyword is escaped so it is matched
# literally (assumes the "context" column holds plain words, not regexes — confirm);
# blank or non-string cells are skipped because they would either break the join or
# make the pattern match every tweet.
pattern = re.compile('|'.join(
    re.escape(keyword)
    for keyword in healthy_lexicon_keywords
    if isinstance(keyword, str) and keyword
))

# Keep only the control-labelled rows whose cleaned text contains a keyword.
filtered_healthy_df = df[df['Depressed'] == 0]
has_healthy_keyword = filtered_healthy_df["noiselessTweet"].astype(str).apply(
    lambda x: pattern.search(x) is not None
)
filtered_healthy_df = filtered_healthy_df[has_healthy_keyword]
filtered_healthy_df.head(50)

In [None]:
### Sample dataframe after filtering with lexicon
# Stack the two filtered frames, shuffle with a fixed seed so a fresh run
# reproduces the same row order, and renumber the index from 0.
sampling_df = (
    pd.concat([filtered_depressed_df, filtered_healthy_df], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
sampling_df.head()


In [None]:
# Number of non-null values in each column of the lexicon-filtered sample.
sampling_df.count()

In [None]:
nltk.download('stopwords')
# Alias the corpus reader so the `stopwords` list below does not rebind the imported module name.
from nltk.corpus import stopwords as nltk_stopwords

# Lexicon keywords must survive stop-word removal, so they are taken out of the
# stop list. Sets make each membership test a hash lookup instead of a list scan.
lexicon_terms = set(depressed_lexicon_keywords) | set(healthy_lexicon_keywords)
stopwords = [word for word in nltk_stopwords.words('chinese') if word not in lexicon_terms]
stopword_set = set(stopwords)

def remove_stopwords(tweet):
  """Segment ``tweet`` and return it with stop words removed.

  Args:
    tweet: Cleaned tweet text to segment.

  Returns:
    The remaining tokens joined back together without separators.
  """
  # cut(..., text=True) is split on single spaces below — presumably it returns
  # the tokens as one space-separated string; confirm in the THULAC-Python README.
  segmented = chinese_tokenizer.cut(tweet, text=True)
  return "".join(word for word in segmented.split(" ") if word not in stopword_set)

sampling_df["cleanTweet"] = sampling_df["noiselessTweet"].apply(remove_stopwords)
sampling_df.head()


In [None]:
# Show the first 50 rows of the sample, including the new cleanTweet column.
sampling_df.head(50)

In [None]:
import gensim

# Build the training corpus from every cleaned tweet in `df`
# (the full frame, not the lexicon-filtered sample).
TweetList = df["noiselessTweet"].tolist()
TweetCorpus = []
for tweet in TweetList:
  if not tweet:
    # Tweets with no characters in U+4E00-U+9FFF are empty after cleaning; skip them.
    continue
  # Drop empty tokens: "".split(" ") yields [""], and with min_count=1 that empty
  # string would otherwise enter the vocabulary as a word.
  tokens = [token for token in chinese_tokenizer.cut(tweet, text=True).split(" ") if token]
  if tokens:
    TweetCorpus.append(tokens)

# Train the embeddings and persist them next to the notebook.
model = gensim.models.Word2Vec(TweetCorpus, window=5, min_count=1, workers=4)
model.save("tweets.model")