# Import Libraries

In [1]:
import json
import pandas as pd
import emoji
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report


# Retrieve Data

In [2]:
f1 = open('tweets.json')
data_tweets = json.load(f1).items()
user_label = pd.read_csv('labeled_users_clean.csv')


In [3]:
# Rename column for merging
user_label.rename(columns={"screen_name": "id"}, inplace=True)
print(user_label.head())
print(len(user_label))


   Unnamed: 0  pic_id               id       user_id lang            location  \
0           0       0    _____zac_____  4.614412e+08   en       Maryland, USA   
1           1       1         ___aleia  7.650000e+17   en           Ohio, USA   
2           2       3  ___schaeffer___  1.257110e+09   en             The Lou   
3           3       9    __EmilyRice__  3.797155e+09   en    Marble Falls, TX   
4           4      10      __ginaaaa__  1.941566e+09   en  West Virginia, USA   

   protected  followers_count  friends_count  statuses_count  \
0          0         0.000419       0.002298        0.060043   
1          0         0.000948       0.005010        0.007060   
2          0         0.001656       0.008522        0.057368   
3          0         0.000316       0.002453        0.002550   
4          0         0.001410       0.005785        0.029273   

   favourites_count  account_created_at  verified  \
0          0.010963            0.361569       0.0   
1          0.061531   

In [4]:
data_tweets = pd.DataFrame.from_dict(data_tweets)
data_tweets.rename(columns={0: "id", 1: "tweets"}, inplace=True)
print(data_tweets[:5])
print(len(data_tweets))


                id                                             tweets
0    _____zac_____  [@AdvoBarryRoux @GetVidBot, The owner of drip ...
1         ___aleia  [I haven‚Äôt talked to this girl since my sophom...
2          ___Dals  [It come wit it üò≠ https://t.co/ENyEtlphtP, @na...
3  ___schaeffer___  [‚òùüèºüëãüèº https://t.co/xGJLlzLR7g, https://t.co/7N...
4   __andresiscool  [Enough https://t.co/gLarVLIHxW, Some of the w...
2678


In [5]:
# CSV data after cleaning and selection
alldata = pd.merge(data_tweets, user_label, on='id')
print(len(alldata))

980


In [16]:
# Extract 3 useful columns ID, tweets, age
idAndTweets = alldata[["id","tweets","age"]]

# Clean data - remove mentions, urls, hashtags, emoji, punctuations, special chars, stop words

In [17]:
#remove mentions, urls, hashtags.
regexMap = {r"@[\w]+": "", r"http[\S]+": "", r"#[\w]+": ""}

def cleaning(data):
    t = data
    for regx in regexMap.keys():
        t = re.sub(regx, regexMap[regx], str(t))
    return t

#remove emojis
def deEmojify(data):
    t = data
    return emoji.get_emoji_regexp().sub('', str(t))


idAndTweets["tweets"] = idAndTweets["tweets"].apply(cleaning)
idAndTweets["tweets"] = idAndTweets["tweets"].apply(deEmojify)
idAndTweets.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


Unnamed: 0,id,tweets,age
0,_____zac_____,"[' ', ""The owner of drip doesn't even have 100...",1.0
1,___aleia,['I haven‚Äôt talked to this girl since my sopho...,0.0
2,___schaeffer___,"[' ' ' ' 37-14-9', ' congrats sis keep work...",0.0
3,__EmilyRice__,[' yes but come to san marcos and live with me...,0.0
4,__ginaaaa__,"[' small :)', 'Go get ready for dinner. ' JA...",0.0


In [18]:
# removing punctuations, numbers, and special characters
idAndTweets['tweets'] = idAndTweets['tweets'].str.replace("[^a-zA-Z#]", " ")
idAndTweets.head(10)


  idAndTweets['tweets'] = idAndTweets['tweets'].str.replace("[^a-zA-Z#]", " ")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


Unnamed: 0,id,tweets,age
0,_____zac_____,The owner of drip doesn t even have ...,1.0
1,___aleia,I haven t talked to this girl since my sopho...,0.0
2,___schaeffer___,congrats sis keep work...,0.0
3,__EmilyRice__,yes but come to san marcos and live with me...,0.0
4,__ginaaaa__,small Go get ready for dinner JA...,0.0
5,__masonsmith__,pain b g ten basketball is dead ...,0.0
6,__sammybear__,sugar daddies or Venmo send me money pl...,0.0
7,_AJoseph_,What a dumb ass petty islanders tweet ...,0.0
8,_ashleelyon,I just registered to save a life with You ...,0.0
9,_ashleyshaffer,The All American soundtrack is pretty fuckin...,0.0


In [19]:
#remove stop words
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
idAndTweets['tweets'] = idAndTweets['tweets'].apply(
    lambda x: ' '.join([w for w in x.split() if w not in stop_words]))
idAndTweets.head()


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/francischen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


Unnamed: 0,id,tweets,age
0,_____zac_____,The owner drip even mill drip company aint got...,1.0
1,___aleia,I talked girl since sophomore year I slid stor...,0.0
2,___schaeffer___,congrats sis keep workin fs spending memorial ...,0.0
3,__EmilyRice__,yes come san marcos live see next week square ...,0.0
4,__ginaaaa__,small Go get ready dinner JACK related I need ...,0.0


In [20]:
print(idAndTweets.head())
print(idAndTweets.size)

                id                                             tweets  age
0    _____zac_____  The owner drip even mill drip company aint got...  1.0
1         ___aleia  I talked girl since sophomore year I slid stor...  0.0
2  ___schaeffer___  congrats sis keep workin fs spending memorial ...  0.0
3    __EmilyRice__  yes come san marcos live see next week square ...  0.0
4      __ginaaaa__  small Go get ready dinner JACK related I need ...  0.0
2940


# TF-IDF vectorizer & Logistic Regression, Oversampling, 5-fold cross validation

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
tfidf = TfidfVectorizer(max_features=7500, ngram_range=(1, 2))
X = tfidf.fit_transform(idAndTweets['tweets'])
y = idAndTweets['age']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1/4.0, random_state=0)
oversample = SMOTE()
X_train, y_train = oversample.fit_resample(X_train, y_train)


In [22]:
# K-fold (5-fold)
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
tfidf = TfidfVectorizer(max_features=7500, ngram_range=(1, 2))

X = tfidf.fit_transform(idAndTweets['tweets'])
y = idAndTweets['age']

kf = KFold(n_splits=5)
kf.get_n_splits(X)

for train_index, test_index in kf.split(X):
    #print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

# Oversampling
oversample = SMOTE()
X_train, y_train = oversample.fit_resample(X_train, y_train)


In [23]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
log = LogisticRegression(multi_class='multinomial', solver='lbfgs').fit(X_train, y_train)
y_pred = log.predict(X_test)


In [24]:
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

         0.0       0.77      0.91      0.83       129
         1.0       0.74      0.46      0.57        67

    accuracy                           0.76       196
   macro avg       0.75      0.69      0.70       196
weighted avg       0.76      0.76      0.74       196



# Complement Naive Bayes

In [25]:
import nltk
from nltk.tokenize import RegexpTokenizer

training_dataset = idAndTweets.sample(frac=0.7, ignore_index=True)
training_dataset.head()
test_dataset = idAndTweets.drop(training_dataset.index)

token = RegexpTokenizer(r'[a-zA-Z0-9]+')
vec = CountVectorizer(stop_words='english', ngram_range=(1, 3), tokenizer=token. tokenize)

x_train = vec.fit_transform(training_dataset['tweets'].values)
y_train = training_dataset['age']
x_test = vec.transform(test_dataset['tweets'])
y_test= test_dataset['age']


In [26]:
from sklearn.naive_bayes import ComplementNB
model = ComplementNB()
model.fit(x_train, y_train)


ComplementNB()

In [27]:
model.score(x_train,y_train)

0.9985422740524781

In [28]:
model.score(x_test,y_test)

0.9251700680272109

In [29]:
y_prediction = model.predict(x_test)
print(classification_report(y_test,y_prediction))

              precision    recall  f1-score   support

         0.0       0.90      1.00      0.95       202
         1.0       1.00      0.76      0.86        92

    accuracy                           0.93       294
   macro avg       0.95      0.88      0.91       294
weighted avg       0.93      0.93      0.92       294

