## Main script of gender classification on Twitter
### The main script can be run in one go. Every chunk that also exists as its own script has a header here, so as not to lose track of the process.
### Note: All csv files can be found in the 'Data' folder for all intermediate results

#### Script 1. Cleaning the data

In [52]:
#Creating two lists containing the male and the female tweets, plus a count of each, so that they can be split into training, validation and test sets (roughly 70/10/20).

import csv
# Read the raw dataset into a list of plain dicts (one per CSV row).
# newline='' leaves newline handling to the csv module, as its documentation
# requires, so line breaks inside quoted fields are interpreted correctly.
with open("gender-classifier-DFE-791531.csv", "r", encoding='utf8', errors='ignore', newline='') as csvfile:
    tweet_reader = csv.DictReader(csvfile, delimiter=',', quotechar='"')
    tweet_dicts = [dict(d) for d in tweet_reader]

# Only rows whose 'gender' column is exactly 'male' or 'female' are kept;
# rows with any other value end up in neither list.
male_tweet_dicts = []
female_tweet_dicts = []

for item in tweet_dicts:
    if item['gender'] == 'male':
        male_tweet_dicts.append(item)
    elif item['gender'] == 'female':
        female_tweet_dicts.append(item)

# The counts are simply the sizes of the two lists built above.
male_tweets_count = len(male_tweet_dicts)
female_tweets_count = len(female_tweet_dicts)

#### Script 2. Splitting the data

In [None]:
#Creating a training, validation and test set.        

# Slice ends are exclusive, so each slice must start exactly where the
# previous one stopped; otherwise the row at the boundary index is lost.
# NOTE(review): the hard-coded boundaries presumably assume 6194 male and
# 6700 female rows - confirm against male_tweets_count / female_tweets_count.

# Training set: the first part of each gender list.
training_list_male = male_tweet_dicts[:4335]
training_list_female = female_tweet_dicts[:4690]
training_list = training_list_male + training_list_female

# Validation set: the rows directly following the training rows.
validation_list_male = male_tweet_dicts[4335:4955]
validation_list_female = female_tweet_dicts[4690:5360]
validation_list = validation_list_male + validation_list_female

# Test set: the rows directly following the validation rows.
test_list_male = male_tweet_dicts[4955:6194]
test_list_female = female_tweet_dicts[5360:6700]
test_list = test_list_male + test_list_female

#### Script 3. Downloading all relevant packages

In [187]:
import spacy
# Load the spaCy pipeline registered under the name 'en'; `nlp` is the parser
# used by the linguistic and sentiment feature functions further down.
# NOTE(review): the 'en' shortcut name is not available in spaCy 3.x, which
# expects a full package name such as 'en_core_web_sm' - confirm the spaCy
# version this notebook is meant to run on.
nlp = spacy.load('en')

import nltk
# Fetch the VADER lexicon needed by SentimentIntensityAnalyzer.
nltk.downloader.download('vader_lexicon')
# Doc is imported but not referenced in the visible cells.
from spacy.tokens import Doc
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Shared analyzer instance; presumably used by the `polarity_scores` getter,
# which is not defined in the visible part of the file - TODO confirm.
sentiment_analyzer = SentimentIntensityAnalyzer()


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/lisamarkslag/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


#### Script 4. Processing the data - selecting all the relevant information

In [263]:
#Creating lists of dictionaries that contain only the relevant information (for the training, validation and test sets).

# Columns that are carried over into the reduced datasets.
_relevant_columns = ['gender', 'text', 'retweet_count', 'created']

# For every tweet keep only the relevant columns, in the order in which they
# appear in the original row.
updated_training_list = [
    {name: content for name, content in row.items() if name in _relevant_columns}
    for row in training_list
]

updated_validation_list = [
    {name: content for name, content in row.items() if name in _relevant_columns}
    for row in validation_list
]

updated_test_list = [
    {name: content for name, content in row.items() if name in _relevant_columns}
    for row in test_list
]


#### Script 5. Saving all the datasets (training, validation and test)

In [None]:
#saving the training, validation and test sets as csv

# Column order shared by the three dataset files.
fieldnames = ['gender', 'created', 'retweet_count', 'text']

training_set = "training_set.csv"
validation_set = "validation_set.csv"
test_set = "test_set.csv"

# NOTE(review): the files are written with the platform default encoding,
# while the raw data is read as UTF-8; consider passing an explicit encoding
# here and in every place these files are read back.
for split_filename, split_rows in (
    (training_set, updated_training_list),
    (validation_set, updated_validation_list),
    (test_set, updated_test_list),
):
    # newline='' is required by the csv module for files it writes to;
    # without it, platforms that translate '\n' produce '\r\r\n' row endings.
    with open(split_filename, "w", newline='') as split_outfile:
        writer = csv.DictWriter(split_outfile, fieldnames = fieldnames)
        writer.writeheader()
        for dictionary in split_rows:
            # Indexing (not .get) keeps the KeyError for a missing column.
            writer.writerow({column: dictionary[column] for column in fieldnames})
        


#### Script 6. All the functions to calculate features

In [189]:
#Defining functions to call while storing the data as a csv

#Punctuation and Numbers

def num_comma(text_tweet):
    """Return how many ',' characters occur in the tweet text"""
    return text_tweet.count(',')

def num_period(text_tweet):
    """Return how many '.' characters occur in the tweet text"""
    return text_tweet.count('.')

def num_question_mark(text_tweet):
    """Return how many '?' characters occur in the tweet text"""
    return text_tweet.count('?')

def num_lowercase(text_tweet):
    """Return the number of characters in the tweet for which str.islower() is true"""
    lowercase_letters = [character for character in text_tweet if character.islower()]
    return len(lowercase_letters)

def num_uppercase(text_tweet):
    """Return the number of characters in the tweet for which str.isupper() is true"""
    uppercase_letters = [character for character in text_tweet if character.isupper()]
    return len(uppercase_letters)

def num_numeric(text_tweet):
    """Return the number of characters in the tweet for which str.isnumeric() is true"""
    numeric_characters = [character for character in text_tweet if character.isnumeric()]
    return len(numeric_characters)

def count_symbols(text_tweet):
    """Return the number of characters in the tweet that are in string.punctuation"""
    import string
    return sum(1 for character in text_tweet if character in string.punctuation)

def count_exl_marks(text_tweet):
    """Return how many '!' characters occur in the tweet text"""
    return text_tweet.count("!")

#Extra-linguistic features

def count_reply_tweet(text_tweet):
    """Return how many '@' characters (used to address other users) occur in the tweet text"""
    return text_tweet.count('@')

def contains_links(text_tweet):
    """Return 1 if the tweet text contains the substring 'https:', otherwise 0"""
    return 1 if "https:" in text_tweet else 0

def contains_smiley(text_tweet):
    """Return 1 if the tweet text contains ':)' surrounded by single spaces, otherwise 0"""
    return 1 if " :) " in text_tweet else 0

def num_hashtags(text_tweet):
    """Return how many '#' characters occur in the tweet text"""
    return text_tweet.count('#')

#Linguistic

def count_number_verbs(text_tweet):
    """Return the number of tokens in the parsed tweet whose pos_ tag is 'VERB'"""
    return sum(1 for token in nlp(text_tweet) if token.pos_ == "VERB")

def count_number_nouns(text_tweet):
    """Return the number of tokens in the parsed tweet whose pos_ tag is 'NOUN'"""
    return sum(1 for token in nlp(text_tweet) if token.pos_ == "NOUN")

def count_tokens(text_tweet):
    """Return the length of the document that nlp produces for the tweet text"""
    return len(nlp(text_tweet))

def num_like(text_tweet):
    """Return the summed occurrence counts of ' like ', ' Like ' and 'Like '

    The counts of the individual patterns are added up, so a stretch of text
    that matches more than one pattern is counted once per pattern.
    """
    like_patterns = (" like ", " Like ", "Like ")
    return sum(text_tweet.count(pattern) for pattern in like_patterns)

def num_first_person(text_tweet):
    """Return the summed occurrence counts of a fixed set of first person singular patterns

    The counts of the individual patterns are added up, so a stretch of text
    that matches more than one pattern is counted once per pattern.
    """
    first_person_patterns = (
        # I
        "I ", " I ", " i ",
        # my
        "My ", " My ", " my ", "my ",
        # I'm
        " I'm ", "I'm ", " Im ", " i'm ", " im ",
        # I have
        " I have ", "I have ", " I Have ", "I Have ",
        # me
        " me ", " Me ", "Me ",
        # I am
        " I am ", "I am ",
    )
    return sum(text_tweet.count(pattern) for pattern in first_person_patterns)

#Sentiment

def positive_scores(text_tweet):
    """Return the "pos" entry of the polarity scores of a tweet text"""
    parsed_tweet = nlp(text_tweet)
    # force=True allows the extension to be registered again on every call.
    parsed_tweet.set_extension('polarity_scores', getter=polarity_scores, force=True)
    sentiment = parsed_tweet._.polarity_scores
    # NOTE(review): the result of this direct call is discarded; it is kept
    # because polarity_scores is defined outside the visible code.
    polarity_scores(parsed_tweet)
    return sentiment["pos"]


def negative_scores(text_tweet):
    """Return the "neg" entry of the polarity scores of a tweet text"""
    parsed_tweet = nlp(text_tweet)
    # force=True allows the extension to be registered again on every call.
    parsed_tweet.set_extension('polarity_scores', getter=polarity_scores, force=True)
    sentiment = parsed_tweet._.polarity_scores
    # NOTE(review): the result of this direct call is discarded; it is kept
    # because polarity_scores is defined outside the visible code.
    polarity_scores(parsed_tweet)
    return sentiment["neg"]

def neutral_scores(text_tweet):
    """Return the "neu" entry of the polarity scores of a tweet text"""
    parsed_tweet = nlp(text_tweet)
    # force=True allows the extension to be registered again on every call.
    parsed_tweet.set_extension('polarity_scores', getter=polarity_scores, force=True)
    sentiment = parsed_tweet._.polarity_scores
    # NOTE(review): the result of this direct call is discarded; it is kept
    # because polarity_scores is defined outside the visible code.
    polarity_scores(parsed_tweet)
    return sentiment["neu"]



#### Script 7. Storing and opening the training data

In [261]:
#Storing the training data as csv

outfilename = "training_set_features.csv"
# newline='' is required by the csv module for files it writes to; without
# it, platforms that translate '\n' produce '\r\r\n' row endings.
with open(outfilename, "w", newline='') as outfile:
    fieldnames = ['gender', 'created', 'retweet_count', 'text', 'num_exclamation', 'num_tokens', 'num_replys', 'contains_link', 'num_verbs', 'num_nouns', 'neg_score', 'pos_score', 'neu_score', 'num_commas', 'num_period', 'num_question_mark', 'num_upper', 'num_lower', 'num_num', 'contains_smiley', 'num_likes', 'num_first_person', 'num_hashtags', 'num_symbols']
    writer = csv.DictWriter(outfile, fieldnames = fieldnames)
    writer.writeheader()
    for dictionary in updated_training_list:
        tweet_text = dictionary["text"]
        # One row per training tweet: the four original columns followed by
        # the twenty computed features. The column order in the file is
        # determined by `fieldnames`, not by the order of the keys below.
        writer.writerow({
            'gender': dictionary['gender'],
            'created': dictionary['created'],
            'retweet_count': dictionary['retweet_count'],
            'text': tweet_text,
            'num_exclamation': count_exl_marks(tweet_text),
            'num_tokens': count_tokens(tweet_text),
            'num_replys': count_reply_tweet(tweet_text),
            'contains_link': contains_links(tweet_text),
            'num_verbs': count_number_verbs(tweet_text),
            'num_nouns': count_number_nouns(tweet_text),
            'neg_score': negative_scores(tweet_text),
            'pos_score': positive_scores(tweet_text),
            'neu_score': neutral_scores(tweet_text),
            'num_commas': num_comma(tweet_text),
            'num_period': num_period(tweet_text),
            'num_question_mark': num_question_mark(tweet_text),
            'num_upper': num_uppercase(tweet_text),
            'num_lower': num_lowercase(tweet_text),
            'num_num': num_numeric(tweet_text),
            'contains_smiley': contains_smiley(tweet_text),
            'num_likes': num_like(tweet_text),
            'num_first_person': num_first_person(tweet_text),
            'num_hashtags': num_hashtags(tweet_text),
            'num_symbols': count_symbols(tweet_text),
        })
        

In [1]:
#Opening the training data
        
# Read the feature file back into a list of plain dicts (one per row).
# newline='' leaves newline handling to the csv module, as its documentation
# requires, so line breaks inside quoted tweet texts are interpreted correctly.
with open("training_set_features.csv", newline='') as csvfile2:
    tweet_reader_2 = csv.DictReader(csvfile2, delimiter=',', quotechar='"')
    tweet_dicts_2 = [dict(d) for d in tweet_reader_2]
    

FileNotFoundError: [Errno 2] No such file or directory: '../../../../../../Desktop/training_set_features.csv'

#### Script 8. Building the classifier without specified weights

In [None]:
def classifier(tweet):
    """Predict "male" or "female" for a tweet using all 20 features, unweighted.

    For every feature the absolute difference between the tweet's value and
    the male reference value (fm1..fm20) and the female reference value
    (ff1..ff20) is computed. The differences are summed per gender and the
    gender with the smaller total distance is returned.

    fm1..fm20 and ff1..ff20 are module-level names defined outside this
    function; presumably the per-feature averages of the male and female
    training tweets - TODO confirm.

    Parameters:
        tweet: mapping with a "text" entry holding the tweet text.

    Returns:
        "male" or "female". When both total distances are equal, "female" is
        returned (the larger class in the training split); the previous
        version raised UnboundLocalError in that case.
    """
    text = tweet["text"]

    # One entry per feature, in feature-number order (1-20):
    # (male reference value, female reference value, value for this tweet).
    comparisons = [
        (fm1, ff1, count_reply_tweet(text)),
        (fm2, ff2, contains_links(text)),
        (fm3, ff3, contains_smiley(text)),
        (fm4, ff4, num_hashtags(text)),
        (fm5, ff5, count_number_nouns(text)),
        (fm6, ff6, count_number_verbs(text)),
        (fm7, ff7, count_tokens(text)),
        (fm8, ff8, num_like(text)),
        (fm9, ff9, num_first_person(text)),
        (fm10, ff10, num_lowercase(text)),
        (fm11, ff11, num_uppercase(text)),
        (fm12, ff12, positive_scores(text)),
        (fm13, ff13, negative_scores(text)),
        (fm14, ff14, neutral_scores(text)),
        (fm15, ff15, num_comma(text)),
        (fm16, ff16, num_period(text)),
        (fm17, ff17, num_question_mark(text)),
        (fm18, ff18, count_exl_marks(text)),
        (fm19, ff19, num_numeric(text)),
        (fm20, ff20, count_symbols(text)),
    ]

    # Accumulate in feature order with plain addition so that the totals are
    # the same values the original chained "+" expressions produced.
    total_distance_m = 0
    total_distance_f = 0
    for male_reference, female_reference, tweet_value in comparisons:
        total_distance_m += abs(male_reference - tweet_value)
        total_distance_f += abs(female_reference - tweet_value)

    # Closer to the male reference values -> "male"; otherwise (including a
    # tie) -> "female".
    if total_distance_f > total_distance_m:
        return "male"
    return "female"

#### Script 9. Building the classifier with specified weights and including only relevant features

In [298]:
def classifierweights(tweet):
    """Predict "male" or "female" for a tweet using 13 features, two of them weighted.

    Works like the unweighted classifier, but only features 2, 4, 5, 6, 7, 9,
    11, 12, 15, 16, 17, 18 and 19 are used. The distance of feature 9 (first
    person expressions) is multiplied by 50 and that of feature 12 (positive
    sentiment) by 10.

    The fm*/ff* reference values are module-level names defined outside this
    function; presumably the per-feature averages of the male and female
    training tweets - TODO confirm.

    Parameters:
        tweet: mapping with a "text" entry holding the tweet text.

    Returns:
        "male" or "female". When both total distances are equal, "female" is
        returned (the larger class in the training split); the previous
        version raised UnboundLocalError in that case.
    """
    text = tweet["text"]

    # One entry per feature, in feature-number order:
    # (weight, male reference value, female reference value, tweet value).
    weighted_comparisons = [
        (1, fm2, ff2, contains_links(text)),
        (1, fm4, ff4, num_hashtags(text)),
        (1, fm5, ff5, count_number_nouns(text)),
        (1, fm6, ff6, count_number_verbs(text)),
        (1, fm7, ff7, count_tokens(text)),
        (50, fm9, ff9, num_first_person(text)),
        (1, fm11, ff11, num_uppercase(text)),
        (10, fm12, ff12, positive_scores(text)),
        (1, fm15, ff15, num_comma(text)),
        (1, fm16, ff16, num_period(text)),
        (1, fm17, ff17, num_question_mark(text)),
        (1, fm18, ff18, count_exl_marks(text)),
        (1, fm19, ff19, num_numeric(text)),
    ]

    # Accumulate in feature order with plain addition so that the totals are
    # the same values the original chained "+" expressions produced.
    total_distance_m = 0
    total_distance_f = 0
    for weight, male_reference, female_reference, tweet_value in weighted_comparisons:
        total_distance_m += abs(male_reference - tweet_value) * weight
        total_distance_f += abs(female_reference - tweet_value) * weight

    # Closer to the male reference values -> "male"; otherwise (including a
    # tie) -> "female".
    if total_distance_f > total_distance_m:
        return "male"
    return "female"

#### Script 10. CSV files for the validation and test sets (first without specified weights, second with specified weights and relevant features)

In [274]:
#creating csv files for calculating the accuracy (using classifier without weights and all features)

# Each file gets one row per tweet: its text, its gold label and the label
# predicted by the unweighted classifier.
# newline='' is required by the csv module for files it writes to; without
# it, platforms that translate '\n' produce '\r\r\n' row endings.

validation_prediction_noweights = "validation_noweights_prediction.csv"
with open(validation_prediction_noweights, "w", newline='') as validation_noweights_outfile:
    fieldnames = ['text_tweet', 'gold', 'prediction']
    writer = csv.DictWriter(validation_noweights_outfile, fieldnames = fieldnames)
    writer.writeheader()
    for dictionary in validation_list:
        classification = classifier(dictionary)
        writer.writerow({'text_tweet': dictionary['text'], 'gold': dictionary['gender'], 'prediction': classification})

test_prediction_noweights = "test_noweights_prediction.csv"
with open(test_prediction_noweights, "w", newline='') as test_noweights_outfile:
    fieldnames = ['text_tweet', 'gold', 'prediction']
    writer = csv.DictWriter(test_noweights_outfile, fieldnames = fieldnames)
    writer.writeheader()
    for dictionary in test_list:
        classification = classifier(dictionary)
        writer.writerow({'text_tweet': dictionary['text'], 'gold': dictionary['gender'], 'prediction': classification})

In [299]:
#creating csv files for calculating the accuracy (using classifier with weights and 13 features)

# Each file gets one row per tweet: its text, its gold label and the label
# predicted by the weighted classifier.
# newline='' is required by the csv module for files it writes to; without
# it, platforms that translate '\n' produce '\r\r\n' row endings.

validation_prediction_weights = "validation_weights_prediction.csv"
with open(validation_prediction_weights, "w", newline='') as validation_weights_outfile:
    fieldnames = ['text_tweet', 'gold', 'prediction']
    writer = csv.DictWriter(validation_weights_outfile, fieldnames = fieldnames)
    writer.writeheader()
    for dictionary in validation_list:
        classification = classifierweights(dictionary)
        writer.writerow({'text_tweet': dictionary['text'], 'gold': dictionary['gender'], 'prediction': classification})

test_prediction_weights = "test_weights_prediction.csv"
with open(test_prediction_weights, "w", newline='') as test_weights_outfile:
    fieldnames = ['text_tweet', 'gold', 'prediction']
    writer = csv.DictWriter(test_weights_outfile, fieldnames = fieldnames)
    writer.writeheader()
    for dictionary in test_list:
        classification = classifierweights(dictionary)
        writer.writerow({'text_tweet': dictionary['text'], 'gold': dictionary['gender'], 'prediction': classification})

#### Script 11. Calculating the accuracy of classifier (without and with specified weights)

In [2]:
#opening the csvs and calculating the accuracy of the classifier while looking at different sets

def _load_prediction_rows(path):
    """Read a prediction csv file and return its rows as a list of plain dicts."""
    # newline='' leaves newline handling to the csv module, as its
    # documentation requires, so line breaks inside quoted tweet texts are
    # interpreted correctly.
    with open(path, newline='') as prediction_file:
        prediction_reader = csv.DictReader(prediction_file, delimiter=',', quotechar='"')
        return [dict(row) for row in prediction_reader]


# Unweighted classifier: validation set, then test set.
prediction_dicts = _load_prediction_rows("validation_noweights_prediction.csv")
prediction_dicts_2 = _load_prediction_rows("test_noweights_prediction.csv")

# Weighted classifier: validation set, then test set.
prediction_dicts_3 = _load_prediction_rows("validation_weights_prediction.csv")
prediction_dicts_4 = _load_prediction_rows("test_weights_prediction.csv")

def accuracy(list_of_tweets):
    """Return the percentage of tweets whose prediction equals the gold label.

    Parameters:
        list_of_tweets: list of mappings, each with a 'gold' and a
            'prediction' entry.

    Returns:
        The share of entries where both values are equal, as a percentage
        between 0 and 100. An empty list yields 0.0 instead of raising
        ZeroDivisionError.
    """
    total_count = len(list_of_tweets)
    if total_count == 0:
        return 0.0
    correct_count = sum(1 for tweet in list_of_tweets if tweet['gold'] == tweet['prediction'])
    return correct_count/total_count*100

# Label / prediction-set pairs in the order in which they are reported.
report_parts = []
for report_label, predicted_rows in (
    ("The accuracy of the classifier with all (20) features and without weights on the validation set is:", prediction_dicts),
    ("\nThe accuracy of the classifier with all (20) features and without weights on the test set is:", prediction_dicts_2),
    ("\nThe accuracy of the classifier with only 13 features and with weights on feature 9 and 12 on the validation set is:", prediction_dicts_3),
    ("\nThe accuracy of the classifier with only 13 features and with weights on feature 9 and 12 on the test set is:", prediction_dicts_4),
):
    report_parts.extend((report_label, accuracy(predicted_rows)))

# A single print call, so all parts are joined by the default separator.
print(*report_parts)


FileNotFoundError: [Errno 2] No such file or directory: '../../../../../../Desktop/validation_noweights_prediction.csv'