### FIT5149 - Applied Data Analysis - Assignment 2

### IMPORT PACKAGES

In [259]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk import word_tokenize    
from nltk.tokenize import wordpunct_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix, matthews_corrcoef
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB,BernoulliNB
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
import xml.etree.ElementTree as ET
from sklearn import preprocessing
import re, string
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

### PREPARING THE DATA

In [265]:
train_labels = pd.read_csv("train_labels.csv")
test_data = pd.read_csv("test.csv")

In [266]:
#Encode Labels
le = preprocessing.LabelEncoder()
train_labels['label'] = le.fit_transform(train_labels.gender.values)
train_labels

Unnamed: 0,id,gender,label
0,d7d392835f50664fc079f0f388e147a0,male,1
1,ee40b86368137b86f51806c9f105b34b,female,0
2,919bc742d9a22d65eab1f52b11656cab,male,1
3,15b97a08d65f22d97ca685686510b6ae,female,0
4,affa98421ef5c46ca7c8f246e0a134c1,female,0
...,...,...,...
3095,97159e619b8d88bdd837f7f7e738de43,male,1
3096,9bccadb3d0033a2b2ad4403184ea72f5,female,0
3097,f252cb406d4c27e71414148175fe6878,female,0
3098,5dcf483c6ceb4cdf9de1648486f28706,female,0


In [270]:
def parse_training_docs(data):
    train_data = data
    for index, row in train_labels.iterrows():
        doc_string = ""
        tree = ET.parse('data/'+row['id']+'.xml')
        document = tree.getroot().find("documents").findall("document")
        for doc in document:
            doc_string = doc_string + " " + doc.text
        train_labels.loc[index,'document'] = doc_string
    return train_data

train_labels = parse_training_docs(train_labels)

In [271]:
train_data, validation_data = train_test_split(train_labels, test_size=0.2)

In [272]:
train_data.head()

Unnamed: 0,id,gender,label,document
1783,87483c7bbfb84744bed74199f986d9d0,male,1,Fucking A!!! @cardiffdevils @FarmerG5 @SamDun...
2577,ac196f715c1d41959ec006cb21966fc5,female,0,You don't feel like hiding in your personal c...
975,e83d63fb5c8acb7bc67e574b20fb484b,male,1,@AGhostler *if @AGhostler I would not recomme...
3062,1890ce7f4ef5c16a89ba04eef28c7756,male,1,"@realDonaldTrump pathetic leadership “Still, ..."
163,85b7b6367d73f9c7a5f4db51312020e4,female,0,@nypost by weighing they mean they have a spe...


In [273]:
print("The number of Training documents are: ", len(train_data))
print("The number of Validation documents are: ", len(validation_data))

The number of Training documents are:  2480
The number of Validation documents are:  620


In [274]:
train_docs = train_data["document"] .tolist()
train_labels = train_data["label"].tolist()

validation_docs = validation_data["document"] .tolist()
validation_labels = validation_data["label"].tolist()

In [275]:
import re, string
def clean_documents(document):
    # Remove square brackets characters
#     document = re.sub(r'\[.*?\]', '', document)
    #Remove URL
    document = re.sub(r'http\S+', '', document)
    #Remove user mentions
    document = re.sub(r"@(\w+)", ' ', document, flags=re.MULTILINE)
    #Remove punctuations
    document = re.sub(r'[%s]' % re.escape(string.punctuation), '', document)
    #Remove words with numbers
    document = re.sub(r'\w*\d\w*', '', document)  
    #Remove triple dots
    document = document.replace("…", " ")
    #Remove Quotes
    document = document.replace("'", "")
    document = document.replace("\"", "")
    document = document.replace("’", "")
    document = document.replace("“", "")
    document = document.replace("”", "")
    #convert to lowercase
    document = document.lower()
    return document

In [276]:
train_docs = [clean_documents(doc) for doc in train_docs]
validation_docs = [clean_documents(doc) for doc in validation_docs]

In [277]:
#Tokenize the documents
from nltk.tokenize import TweetTokenizer
tknzr = TweetTokenizer()
train_docs = [tknzr.tokenize(doc) for doc in train_docs]
validation_docs = [tknzr.tokenize(doc) for doc in validation_docs]

In [278]:
#Lemmatize and Remove stop words
stopwords_list = stopwords.words('english')
wnl = WordNetLemmatizer()
train_docs = [[ wnl.lemmatize(token) for token in doc if token not in stopwords_list and len(token) > 0] for doc in train_docs]

validation_docs = [[ wnl.lemmatize(token) for token in doc if token not in stopwords_list and len(token) > 0] for doc in validation_docs]


In [279]:
#Remove empty words after lemmatize
train_docs = [[ token for token in doc if len(token) > 0] for doc in train_docs]
validation_docs = [[ token for token in doc if len(token) > 0] for doc in validation_docs]

In [280]:
def dummy_fun(doc):
    return doc

vectorizer = TfidfVectorizer(
    analyzer='word',
    input='content',
    token_pattern='(?u)\\b\\w\\w+\\b',
    min_df = 5,
    tokenizer=dummy_fun,
    preprocessor=dummy_fun) 

In [281]:
x_train=vectorizer.fit_transform(train_docs)
y_train=np.asarray(train_labels)

x_valid=vectorizer.transform(validation_docs)
y_valid=np.asarray(validation_labels)

In [282]:
vectorizer.get_feature_names()

['aa',
 'aaa',
 'aaaaand',
 'aaaand',
 'aaah',
 'aaliyah',
 'aampe',
 'aaron',
 'ab',
 'abandon',
 'abandoned',
 'abandoning',
 'abba',
 'abbey',
 'abbotsford',
 'abbott',
 'abbotts',
 'abby',
 'abc',
 'abducted',
 'abduction',
 'abdullah',
 'abe',
 'abel',
 'aberdeen',
 'abhorrent',
 'abiding',
 'abigail',
 'ability',
 'able',
 'abnormal',
 'aboard',
 'abolish',
 'abolished',
 'abomination',
 'aboot',
 'aboriginal',
 'aborted',
 'abortion',
 'abou',
 'abound',
 'abounds',
 'abraham',
 'abroad',
 'abrupt',
 'abruptly',
 'absence',
 'absent',
 'abso',
 'absolute',
 'absolutely',
 'absorb',
 'absorbed',
 'abstract',
 'absurd',
 'absurdity',
 'absurdly',
 'abt',
 'abu',
 'abundance',
 'abuse',
 'abused',
 'abuser',
 'abusing',
 'abusive',
 'abysmal',
 'abyss',
 'ac',
 'aca',
 'academia',
 'academic',
 'academy',
 'acc',
 'acca',
 'accelerate',
 'accelerates',
 'accelerating',
 'accelerator',
 'accent',
 'accept',
 'acceptable',
 'acceptance',
 'accepted',
 'accepting',
 'accepts',
 'acces

### LOGISTIC REGRESSION

In [283]:
logistic_regression_model = LogisticRegression()

CV = 10
cv_df = pd.DataFrame(index=range(CV * len(models)))
entries = []

model_name = logistic_regression_model.__class__.__name__
accuracies = cross_val_score(logistic_regression_model, x_train, y_train, scoring='accuracy', cv=CV)
for fold_idx, accuracy in enumerate(accuracies):
    entries.append((model_name, fold_idx, accuracy))
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

cv_df

Unnamed: 0,model_name,fold_idx,accuracy
0,LogisticRegression,0,0.826613
1,LogisticRegression,1,0.850806
2,LogisticRegression,2,0.798387
3,LogisticRegression,3,0.754032
4,LogisticRegression,4,0.822581
5,LogisticRegression,5,0.794355
6,LogisticRegression,6,0.806452
7,LogisticRegression,7,0.818548
8,LogisticRegression,8,0.802419
9,LogisticRegression,9,0.754032


In [286]:
#Measure performance on validation data
logistic_regression_model.fit(x_train, y_train)
print(model_name)
# Do the prediction
y_predict=logistic_regression_model.predict(x_valid)
print(confusion_matrix(y_valid,y_predict))
recall=recall_score(y_valid,y_predict,average='macro')
precision=precision_score(y_valid,y_predict,average='macro')
f1score=f1_score(y_valid,y_predict,average='macro')
accuracy=accuracy_score(y_valid,y_predict)
matthews = matthews_corrcoef(y_valid,y_predict) 
print('Accuracy: '+ str(accuracy))
print('Macro Precision: '+ str(precision))
print('Macro Recall: '+ str(recall))
print('Macro F1 score:'+ str(f1score))
print('MCC:'+ str(matthews))

LogisticRegression
[[239  89]
 [ 57 235]]
Accuracy: 0.7645161290322581
Macro Precision: 0.7663705372038705
Macro Recall: 0.7667265285666556
Macro F1 score:0.764506327006327
MCC:0.5330969469086221
