In [18]:
import pandas as pd
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import re 
import string

Loading the Data

In [39]:
df = pd.read_csv('data\\tweets.csv')
print(df.columns)
print(df.shape)

print("Count of each unqiue country")
print(df['country'].value_counts())

Index(['user_id', 'created_at', 'tweet_id', 'text', 'country'], dtype='object')
(127333, 5)
Count of each unqiue country
USA            65361
UK             15142
Canada          8985
Nigeria         7476
India           2776
               ...  
Panama            99
Lithuania         99
Morocco           98
Ivory Coast       98
Maldives           2
Name: country, Length: 78, dtype: int64


Data Preprocessing
1. Remove extra characters from tweet text
2. Capitalize all words
3. Encode country to an integer value

In [40]:
#Select number of countries that we want our model to examine (Top in # of tweets)
num_of_top_countries = 10
df = df[df["country"].isin(df["country"].value_counts()[:num_of_top_countries].index.values)]

In [41]:
#removing extra characters
df['text'] = df["text"].map(lambda text: re.sub('[^A-Za-z0-9 ]+',' ', text))

#capitalize all words
df['text'] = df["text"].map(lambda text: string.capwords(text))


# Encode Categorical Features
label_encoder = preprocessing.LabelEncoder()
df['country']= label_encoder.fit_transform(df['country'])
df.head

<bound method NDFrame.head of              user_id                created_at      tweet_id  \
0       1.294095e+18  2022-11-22T22:13:48.000Z  1.595179e+18   
1       1.294095e+18  2022-11-22T20:33:39.000Z  1.595154e+18   
2       1.294095e+18  2022-11-22T19:14:08.000Z  1.595134e+18   
3       1.294095e+18  2022-11-22T16:56:17.000Z  1.595099e+18   
4       1.294095e+18  2022-11-22T15:05:37.000Z  1.595071e+18   
...              ...                       ...           ...   
127228  4.704441e+08  2022-11-23T18:34:17.000Z  1.595486e+18   
127229  4.704441e+08  2022-11-23T18:33:17.000Z  1.595486e+18   
127230  4.704441e+08  2022-11-23T18:33:14.000Z  1.595486e+18   
127231  4.704441e+08  2022-11-23T18:30:05.000Z  1.595485e+18   
127232  4.704441e+08  2022-11-23T17:49:37.000Z  1.595475e+18   

                                                     text  country  
0       Rt Jonnycaplan Creative Entrepreneur Of The Ye...        9  
1       Rt Josemarky 200 Shirtsthtgohard Available Her...      

In [42]:
#divide dataset into data (tweet texts) and target (country)
data = df['text']
target = df.iloc[:,-1]

#split data into 80:20 train and test sets
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=7,stratify=target)

In [43]:
print("Samples per country in training set: {}".format(np.bincount(y_train)))

Samples per country in training set: [ 7188  1118  2221  1818  5981  1676  2148   907 12113 52288]


Apply Default Vectorization and then fit Decision Tree Classifier Model

In [44]:
vectorizer = CountVectorizer().fit(X_train) #fit vectorizer on training
X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)

In [45]:
# print(X_train)
tree = DecisionTreeClassifier(random_state=0).fit(X_train,y_train) #takes 6-7 min

In [46]:
y_pred = tree.predict(X_test)

print("Accuracy:",accuracy_score(y_test, y_pred))
print("MAE:",mean_absolute_error(y_test, y_pred))
print("MSE:",mean_squared_error(y_test, y_pred))
print("RMSE:",mean_squared_error(y_test, y_pred,squared=False))
print("r2 score:",r2_score(y_test, y_pred))

Accuracy: 0.6506288589069289
MAE: 1.541458952664075
MSE: 10.05296135376172
RMSE: 3.170640527363788
r2 score: -0.16109617392318798


Modification

In [34]:
df = pd.read_csv('data\\tweets.csv')
num_of_top_countries = 10
df = df[df["country"].isin(df["country"].value_counts()[:num_of_top_countries].index.values)]

New Preprocessing

In [35]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
import nltk
import re
import numpy as np
stop_words = stopwords.words('english')
stopwords_dict = Counter(stop_words)
#remove rt from begining of sentence - do first cause RT is capitalized.
df["text"] = df["text"].map(lambda name: re.sub('^(RT)', ' ', name))
#removing links
df["text"] = df["text"].map(lambda name: re.sub(r'http\S+', ' ', name))
#removing mentions
df["text"] = df["text"].map(lambda name: re.sub("@([a-zA-Z0-9_]{1,50})", '', name))

#remove repeated instances of characters
#removing repeating characters
repeat_pattern = re.compile(r'(\w)\1*') #compile the pattern we are looking for
match_substitution = r'\1' #substituion pattern
df["text"] = df["text"].map(lambda name: re.sub(repeat_pattern, match_substitution, name))
#removal of digits with regex - we do this here because it is possible to have numbers in tags and urls replace with space.
df["text"] = df["text"].map(lambda name: re.sub(r'[0-9]', ' ', name))

EMOJI_PATTERN = re.compile(
    "["
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U00002702-\U000027B0"  # Dingbats
    "\U000024C2-\U0001F251" 
    "]+")
df["text"] = df["text"].map(lambda name: re.sub(EMOJI_PATTERN, ' ', name))
#do this after removing mentions -> don't # here. ->replace with space.
df["text"] = df["text"].map(lambda name: name.lower())
special_pattern = re.compile('[!\.\^\$\|\?\*\+\=\(\)\{\}\@\=\/\<\>\,\~\`\-\%\&\:\;\[\]"“”…]')
df["text"] = df["text"].map(lambda name: re.sub(special_pattern, ' ', name))
#remove a hashtag if it has no significance, ie, not part of a #word
df["text"] = df["text"].map(lambda name: re.sub('(#[^(a-zA-Z0-9)])', ' ', name))
#removing doublicate spaces and all white spaces like \t, \n or \r
df["text"] = df["text"].map(lambda name: " ".join(name.split()))
#Now remove stop words
df["text"] = df["text"].map(lambda name: ' '.join([word for word in name.split() if word not in stopwords_dict]))
#After removing stop words we can clean up more
df["text"] = df["text"].map(lambda name: re.sub('[\']', ' ', name))
#final white space clean up
df["text"] = df["text"].map(lambda name: " ".join(name.split(' ')))
#still need to check for strings that contain whitespaces only and remove them
df["text"] = df["text"].map(lambda text: np.nan if len(text) == 0 else text)
df.dropna(axis=0, inplace=True)

#capitalize all words
df['text'] = df["text"].map(lambda text: string.capwords(text))

# Encode Categorical Features
label_encoder = preprocessing.LabelEncoder()
df['country']= label_encoder.fit_transform(df['country'])

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer

data = df['text']
target = df.iloc[:,-1]

#split data into 80:20 train and test sets
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=7,stratify=target)
print("Samples per country in training set: {}".format(np.bincount(y_train)))
vectorizer = TfidfVectorizer(min_df=5, norm=None).fit(X_train)

X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)

Samples per country in training set: [ 6834  1019  2082  1687  5594  1617  2045   852 11521 48869]


In [37]:
from sklearn.ensemble import ExtraTreesClassifier
etc = ExtraTreesClassifier().fit(X_train, y_train) #takes around 50 min

y_pred = etc.predict(X_test)

print("Accuracy:",accuracy_score(y_test, y_pred))
print("MAE:",mean_absolute_error(y_test, y_pred))
print("MSE:",mean_squared_error(y_test, y_pred))
print("RMSE:",mean_squared_error(y_test, y_pred,squared=False))
print("r2 score:",r2_score(y_test, y_pred))

Accuracy: 0.647491475888943
MAE: 1.5805163175840233
MSE: 10.393862640038968
RMSE: 3.2239514016248707
r2 score: -0.19663793911504346
