## Step 2: Split your Data into Training/Validation/Test

In [52]:
#Creating two lists containing male and female tweets, plus two counts to be able to split them in a 70/20/10 division.

import csv
with open("../../../../../../Desktop/gender-classifier-DFE-791531.csv", "r", encoding='utf8', errors='ignore') as csvfile:
    tweet_reader = csv.DictReader(csvfile, delimiter=',', quotechar='"')
    tweet_dicts = [dict(d) for d in tweet_reader]

male_tweets_count = 0
female_tweets_count = 0

male_tweet_dicts = []
female_tweet_dicts = []

for item in tweet_dicts:
    if item['gender'] == 'male':
        male_tweets_count += 1
        male_tweet_dicts.append(item)
    if item['gender'] == 'female':
        female_tweets_count += 1
        female_tweet_dicts.append(item)

        
#Creating a training, validation and test set.        

training_list_male = male_tweet_dicts[0:4335]
training_list_female = female_tweet_dicts[0:4690]
training_list = training_list_male + training_list_female

validation_list_male = male_tweet_dicts[4336:4955]
validation_list_female = female_tweet_dicts[4690:5360]
validation_list = validation_list_male + validation_list_female

test_list_male = male_tweet_dicts[4956:6194]
test_list_female = female_tweet_dicts[5361:6700]
test_list = test_list_male + test_list_female

In [53]:
import spacy
nlp = spacy.load('en')

import nltk
nltk.downloader.download('vader_lexicon')
from spacy.tokens import Doc
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sentiment_analyzer = SentimentIntensityAnalyzer()


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/lisamarkslag/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


## Step 3: Read and Process the Files

In [54]:
#Creating a list of dictionaries, only with the relevant information (for training).

updated_training_list = []
updated_validation_list = []

for dictionary in training_list:
    dict_tweets = {}
    for column_name, value in dictionary.items():
        if column_name in ['gender', 'text', 'retweet_count', 'created']:
            dict_tweets[column_name] = value
    updated_training_list.append(dict_tweets)
    
for dictionary in validation_list:
    validation_tweet_dicts = {}
    for column_name, value in dictionary.items():
        if column_name in ['gender', 'text', 'retweet_count', 'created']:
            validation_tweet_dicts[column_name] = value
    updated_validation_list.append(validation_tweet_dicts)
    

In [130]:
#Defining functions to call while storing the data as a csv

#Extra-linguistic features

def count_reply_tweet(text_tweet):
    count_reply = text_tweet.count('@')
    return(count_reply)

def contains_links(text_tweet):
    if "https:" in text_tweet:
        contains_link = 1
    if "https:" not in text_tweet:
        contains_link = 0
    return(contains_link)

def contains_smiley(text_tweet):
    if " :) " in text_tweet:
        contains_smiley = 1
    if " :) " not in text_tweet:
        contains_smiley = 0
    return(contains_smiley)


#Linguistic

def count_number_verbs(text_tweet):
    doc = nlp(text_tweet)
    pos_tags = []
    for token in doc: 
        pos_tag = token.pos_
        pos_tags.append(pos_tag)
    count_verbs = pos_tags.count("VERB")
    return(count_verbs)

def count_number_nouns(text_tweet):
    doc = nlp(text_tweet)
    pos_tags = []
    for token in doc: 
        pos_tag = token.pos_
        pos_tags.append(pos_tag)
    count_nouns = pos_tags.count("NOUN")
    return(count_nouns)

def count_tokens(text_tweet):
    doc = nlp(text_tweet)
    num_tokens = len(doc)
    return(num_tokens)

def num_like(text_tweet):
    number_likes = text_tweet.count(" like ") + text_tweet.count(" Like ") + text_tweet.count("Like ")
    return(number_likes)

def num_first_person(text_tweet):
    """Counting the number of first person singular expressions"""
    num_I = text_tweet.count("I ") + text_tweet.count(" I ") + text_tweet.count(" i ")
    num_my = text_tweet.count("My ") + text_tweet.count(" My ") + text_tweet.count(" my ") + text_tweet.count("my ")
    num_Im = text_tweet.count(" I'm ") + text_tweet.count("I'm ") + text_tweet.count(" Im ") + text_tweet.count(" i'm ") + text_tweet.count(" im ")
    num_Ihave = text_tweet.count(" I have ") + text_tweet.count("I have ") + text_tweet.count(" I Have ") + text_tweet.count("I Have ")
    num_me = text_tweet.count(" me ") + text_tweet.count(" Me ") + text_tweet.count("Me ")
    num_Iam = text_tweet.count(" I am ") + text_tweet.count("I am ")
    num_first_person = num_I + num_my + num_Im + num_Ihave + num_me + num_Iam
    return(num_first_person)


#Sentiment

def positive_scores(text_tweet):
    doc=nlp(text_tweet)
    doc.set_extension('polarity_scores', getter=polarity_scores, force=True)
    polarity_score = doc._.polarity_scores
    polarity_scores(doc)
    return(polarity_score["pos"])


def negative_scores(text_tweet):
    doc=nlp(text_tweet)
    doc.set_extension('polarity_scores', getter=polarity_scores, force=True)
    polarity_score = doc._.polarity_scores
    polarity_scores(doc)
    return(polarity_score["neg"])

def neutral_scores(text_tweet):
    doc=nlp(text_tweet)
    doc.set_extension('polarity_scores', getter=polarity_scores, force=True)
    polarity_score = doc._.polarity_scores
    polarity_scores(doc)
    return(polarity_score["neu"])



#Punctuation and Numbers

def num_comma(text_tweet):
    num_comma = text_tweet.count(',')
    return(num_comma)

def num_period(text_tweet):
    num_period = text_tweet.count('.')
    return(num_period)

def num_question_mark(text_tweet):
    num_question_mark = text_tweet.count('?')
    return(num_question_mark)

def num_lowercase(text_tweet):
    num_lowercase = sum(1 for c in text_tweet if c.islower())
    return(num_lowercase)

def num_uppercase(text_tweet):
    num_uppercase = sum(1 for c in text_tweet if c.isupper())
    return(num_uppercase)

def num_numeric(text_tweet):
    num_numeric = sum(1 for c in text_tweet if c.isnumeric())
    return(num_numeric)

def count_symbols(text_tweet):
    import string
    import collections as ct
    special_chars = string.punctuation
    num_symbols = sum(v for k, v in ct.Counter(text_tweet).items() if k in special_chars)
    return(num_symbols)

def count_exl_marks(text_tweet):
    count_exl_mark = text_tweet.count("!")
    return(count_exl_mark)

def count_at_statements(text_tweet):
    count_at_statements = text_tweet.count("@")
    return(count_at_statements)


In [131]:
#Storing the data as csv

outfilename = "../../../../../../Desktop/training_set.csv"
with open(outfilename, "w") as outfile:
    fieldnames = ['gender', 'created', 'retweet_count', 'text', 'num_exclamation', 'num_tokens', 'num_replys', 'contains_link', 'num_verbs', 'num_nouns', 'num_ats', 'neg_score', 'pos_score', 'neu_score', 'num_commas', 'num_period', 'num_question_mark', 'num_upper', 'num_num', 'contains_smiley', 'num_likes', 'num_first_person']
    writer = csv.DictWriter(outfile, fieldnames = fieldnames)
    writer.writeheader()
    for dictionary in updated_training_list:
        exl_count = count_exl_marks(dictionary["text"])
        token_count = count_tokens(dictionary["text"])
        reply_count = count_reply_tweet(dictionary["text"])
        link_container = contains_links(dictionary["text"])
        verb_count = count_number_verbs(dictionary["text"])
        noun_count = count_number_nouns(dictionary["text"])
        at_count = count_at_statements(dictionary["text"])
        neg_score = negative_scores(dictionary["text"])
        pos_score = positive_scores(dictionary["text"])
        neu_score = neutral_scores(dictionary["text"])
        comma_count = num_comma(dictionary["text"])
        period_count = num_period(dictionary["text"])
        question_count = num_question_mark(dictionary["text"])
        uppercase_count = num_uppercase(dictionary["text"])
        num_count = num_numeric(dictionary["text"])
        smiley_container = contains_smiley(dictionary["text"])
        like_count = num_like(dictionary["text"])
        first_person_count = num_first_person(dictionary["text"])
        writer.writerow({'gender': dictionary['gender'], 'created': dictionary['created'], 'retweet_count': dictionary['retweet_count'], 'text': dictionary["text"], 'num_exclamation': exl_count, 'num_tokens': token_count, 'num_replys': reply_count, 'contains_link': link_container, 'num_verbs': verb_count, 'num_nouns': noun_count, 'num_ats': at_count, 'neg_score': neg_score, 'pos_score': pos_score, 'neu_score': neu_score, 'num_commas': comma_count, 'num_period': period_count, 'num_question_mark': question_count, 'num_upper': uppercase_count, 'num_num': num_count, 'contains_smiley': smiley_container, 'num_likes': like_count, 'num_first_person': first_person_count})
        

In [132]:
#Opening the training data
        
with open("../../../../../../Desktop/training_set.csv") as csvfile2:
    tweet_reader_2 = csv.DictReader(csvfile2, delimiter=',', quotechar='"')
    tweet_dicts_2 = [dict(d) for d in tweet_reader_2]

print(tweet_dicts_2[0:10])

[{'gender': 'male', 'created': '12/5/13 1:48', 'retweet_count': '0', 'text': 'Robbie E Responds To Critics After Win Against Eddie Edwards In The #WorldTitleSeries https://t.co/NSybBmVjKZ', 'num_exclamation': '0', 'num_tokens': '15', 'num_replys': '0', 'contains_link': '1', 'num_verbs': '1', 'num_nouns': '4', 'num_ats': '0', 'neg_score': '0.129', 'pos_score': '0.224', 'neu_score': '0.647', 'num_commas': '0', 'num_period': '1', 'num_question_mark': '0', 'num_upper': '21', 'num_num': '0', 'contains_smiley': '0', 'num_likes': '0', 'num_first_person': '0'}, {'gender': 'male', 'created': '10/1/12 13:51', 'retweet_count': '0', 'text': 'It felt like they were my friends and I was living the story with them\u06dd https://t.co/arngE0YHNO #retired #IAN1 https://t.co/CIzCANPQFz', 'num_exclamation': '0', 'num_tokens': '21', 'num_replys': '0', 'contains_link': '1', 'num_verbs': '5', 'num_nouns': '5', 'num_ats': '0', 'neg_score': '0.0', 'pos_score': '0.259', 'neu_score': '0.741', 'num_commas': '0', 

In [135]:
#building a classifier

total_male_tweets = 0
total_female_tweets = 0

for tweet in tweet_dicts_2:
    if tweet["gender"] == "male":
        total_male_tweets += 1
    if tweet["gender"] == "female":
        total_female_tweets += 1

#extra-linguistic features
#feature: number of replys

total_amount_male_reply = 0
total_amount_female_reply = 0
for tweet in tweet_dicts_2:
    if tweet["gender"] == "male":
        total_amount_male_reply += int(tweet["num_replys"])
    if tweet["gender"] == "female":
        total_amount_female_reply += int(tweet["num_replys"])

average_amount_male_reply = total_amount_male_reply/total_male_tweets
average_amount_female_reply = total_amount_female_reply/total_female_tweets

fm1 = average_amount_male_reply
ff1 = average_amount_female_reply

#feature: contains links
male_links = 0
female_links = 0
for tweet in tweet_dicts_2:
    if tweet["gender"] == "male":
        male_links += int(tweet["contains_link"])
    if tweet["gender"] == "female":
        female_links += int(tweet["contains_link"])

average_male_links = male_links/total_male_tweets
average_female_links = female_links/total_female_tweets            

fm2 = average_male_links
ff2 = average_female_links

#feature: contains smileys
male_smileys = 0
female_smileys = 0
for tweet in tweet_dicts_2:
    if tweet["gender"] == "male":
        male_smileys += int(tweet["contains_smiley"])
    if tweet["gender"] == "female":
        female_smileys += int(tweet["contains_smiley"])

average_male_smileys = male_smileys/total_male_tweets
average_female_smileys = female_smileys/total_female_tweets

fm3 = average_male_smileys
ff3 = average_female_smileys



#linguistic features
#feature: number of nouns
total_amount_male_nouns = 0
total_amount_female_nouns = 0
for tweet in tweet_dicts_2:
    if tweet["gender"] == "male":
        total_amount_male_nouns += int(tweet["num_nouns"])
    if tweet["gender"] == "female":
        total_amount_female_nouns += int(tweet["num_nouns"])

average_amount_male_nouns = total_amount_male_nouns/total_male_tweets
average_amount_female_nouns = total_amount_female_nouns/total_female_tweets

fm4 = average_amount_male_nouns
ff4 = average_amount_female_nouns

#feature: number of verbs
total_amount_male_verbs = 0
total_amount_female_verbs = 0
for tweet in tweet_dicts_2:
    if tweet["gender"] == "male":
        total_amount_male_verbs += int(tweet["num_verbs"])
    if tweet["gender"] == "female":
        total_amount_female_verbs += int(tweet["num_verbs"])

average_amount_male_verbs = total_amount_male_verbs/total_male_tweets
average_amount_female_verbs = total_amount_female_verbs/total_female_tweets

fm5 = average_amount_male_verbs
ff5 = average_amount_female_verbs

#feature: number of tokens
total_amount_male_tokens = 0
total_amount_female_tokens = 0
for tweet in tweet_dicts_2:
    if tweet["gender"] == "male":
        total_amount_male_tokens += int(tweet["num_tokens"])
    if tweet["gender"] == "female":
        total_amount_female_tokens += int(tweet["num_tokens"])

average_amount_male_tokens = total_amount_male_tokens/total_male_tweets
average_amount_female_tokens = total_amount_female_tokens/total_female_tweets

fm6 = average_amount_male_tokens
ff6 = average_amount_female_tokens

#feature: number of word "like": rather not use this.
total_amount_male_likes = 0
total_amount_female_likes = 0
for tweet in tweet_dicts_2:
    if tweet["gender"] == "male":
        total_amount_male_likes += int(tweet["num_likes"])
    if tweet["gender"] == "female":
        total_amount_female_likes += int(tweet["num_likes"])

average_amount_male_likes = total_amount_male_tokens/total_male_tweets
average_amount_female_likes = total_amount_female_tokens/total_female_tweets

fm7 = average_amount_male_likes
ff7 = average_amount_female_likes

#feature: number of first person expressions
total_amount_male_firsts = 0
total_amount_female_firsts = 0 
for tweet in tweet_dicts_2:
    if tweet["gender"] == "male":
        total_amount_male_firsts += int(tweet["num_first_person"])
    if tweet["gender"] == "female":
        total_amount_female_firsts += int(tweet["num_first_person"])

average_amount_male_firsts = total_amount_male_firsts/total_male_tweets
average_amount_female_firsts = total_amount_female_firsts/total_female_tweets

fm11 = average_amount_male_firsts
ff11 = average_amount_female_firsts

#sentiment features
#feature: negative sentiment score
total_amount_male_negscores = 0
total_amount_female_negscores = 0
for tweet in tweet_dicts_2:
    if tweet["gender"] == "male":
        total_amount_male_negscores += float(tweet["neg_score"])
    if tweet["gender"] == "female":
        total_amount_female_negscores += float(tweet["neg_score"])
        
average_male_negscore = total_amount_male_negscores/total_male_tweets
average_female_negscore = total_amount_female_negscores/total_female_tweets

fm8 = average_male_negscore
ff8 = average_female_negscore

#feature: positive sentiment score
total_amount_male_posscores = 0
total_amount_female_posscores = 0
for tweet in tweet_dicts_2:
    if tweet["gender"] == "male":
        total_amount_male_posscores += float(tweet["pos_score"])
    if tweet["gender"] == "female":
        total_amount_female_posscores += float(tweet["pos_score"])
        
average_male_posscore = total_amount_male_posscores/total_male_tweets
average_female_posscore = total_amount_female_posscores/total_female_tweets

fm9 = average_male_posscore
ff9 = average_female_posscore

#feature: neutral sentiment score
total_amount_male_neuscores = 0
total_amount_female_neuscores = 0
for tweet in tweet_dicts_2:
    if tweet["gender"] == "male":
        total_amount_male_neuscores += float(tweet["neu_score"])
    if tweet["gender"] == "female":
        total_amount_female_neuscores += float(tweet["neu_score"])
        
average_male_neuscore = total_amount_male_neuscores/total_male_tweets
average_female_neuscore = total_amount_female_neuscores/total_female_tweets

fm10 = average_male_neuscore
ff10 = average_female_neuscore

#features: punctuation and numbers
#feature: num @-statements
# total_amount_male_ats = 0
# total_amount_female_ats = 0
# for tweet in tweet_dicts_2:
#     if tweet["gender"] == "male":
#         total_amount_male_ats += int(tweet["num_ats"])
#     if tweet["gender"] == "female":
#         total_amount_female_ats += int(tweet["num_ats"])
        
# #feature: num exclamation marks
# total_amount_male_exl = 0
# total_amount_female_exl = 0
# for tweet in tweet_dicts_2:
#     if tweet["gender"] == "male":
#         total_amount_male_exl += int(tweet["num_exclamation"])
#     if tweet["gender"] == "female":
#         total_amount_female_exl += int(tweet["num_exclamation"])

# average_amount_male_exl = total_amount_male_exl/total_male_tweets
# average_amount_female_exl = total_amount_female_exl/total_female_tweets

# fm1 = average_amount_male_exl
# ff1 = average_amount_female_exl
        
# average_amount_male_ats = total_amount_male_ats/total_male_tweets
# average_amount_female_ats = total_amount_female_ats/total_female_tweets

# fm3 = average_amount_male_ats
# ff3 = average_amount_female_ats
          
def prediction(tweet):
    ft1 = count_reply_tweet(tweet["text"])
    ft2 = contains_links(tweet["text"])
    ft3 = contains_smiley(tweet["text"])
    ft4 = count_number_verbs(tweet["text"])
    ft5 = count_number_nouns(tweet["text"])
    ft6 = count_tokens(tweet["text"])
    ft7 = num_like(tweet["text"])
    ft8 = positive_scores(tweet["text"])
    ft9 = negative_scores(tweet["text"])
    ft10 = neutral_scores(tweet["text"])
    ft11 = num_first_person(tweet["text"])
    distance_fm1 = abs(fm1 - ft1)
    distance_ff1 = abs(ff1 - ft1) 
    distance_fm2 = abs(fm2 - ft2)
    distance_ff2 = abs(ff2 - ft2)
    distance_fm3 = abs(fm3 - ft3)
    distance_ff3 = abs(ff3 - ft3)
    distance_fm4 = abs(fm4 - ft4)
    distance_ff4 = abs(ff4 - ft4) 
    distance_fm5 = abs(fm5 - ft5)
    distance_ff5 = abs(ff5 - ft5) 
    distance_fm6 = abs(fm6 - ft6)
    distance_ff6 = abs(ff6 - ft6) 
    distance_fm7 = abs(fm7 - ft7)
    distance_ff7 = abs(ff7 - ft7) 
    distance_fm8 = abs(fm8 - ft8)
    distance_ff8 = abs(ff8 - ft8)
    distance_fm9 = abs(fm9 - ft9)
    distance_ff9 = abs(ff9 - ft9)
    distance_fm10 = abs(fm10 - ft10)
    distance_ff10 = abs(ff10 - ft10)
    distance_fm11 = abs(fm11 - ft11)
    distance_ff11 = abs(ff11 - ft11)
    total_distance_m = distance_fm1 + distance_fm2 + distance_fm3 + distance_fm4 + distance_fm5 + distance_fm6 + distance_fm7 + distance_fm8 + distance_fm9*50 + distance_fm10 + distance_fm11*10
    total_distance_f = distance_ff1 + distance_ff2 + distance_ff3 + distance_ff4 + distance_ff5 + distance_ff6 + distance_ff7 + distance_ff8 + distance_ff9*50 + distance_ff10 + distance_ff11*10
    
    if total_distance_m > total_distance_f:
        prediction = "female"
    if total_distance_f > total_distance_m:
        prediction = "male"
    
    return(prediction, total_distance_m, total_distance_f)

for tweet in updated_validation_list:
    print("gold: ", tweet["gender"], "\t", "prediction: ", prediction(tweet))

    
    

gold:  male 	 prediction:  ('female', 39.4304675893887, 34.526652878464795)
gold:  male 	 prediction:  ('male', 37.334742099192624, 41.440985501066116)
gold:  male 	 prediction:  ('male', 36.66030519031142, 39.63991428571427)
gold:  male 	 prediction:  ('male', 46.540815916955026, 50.22661876332624)
gold:  male 	 prediction:  ('male', 39.04603160322953, 42.918631556503215)
gold:  male 	 prediction:  ('female', 59.09195086505191, 54.71497270788913)
gold:  male 	 prediction:  ('female', 59.01617600922722, 55.44191002132197)
gold:  male 	 prediction:  ('male', 40.374742099192616, 44.48098550106612)
gold:  male 	 prediction:  ('male', 34.90611003460207, 35.67985970149255)
gold:  male 	 prediction:  ('male', 42.877539331026526, 47.41128742004266)
gold:  male 	 prediction:  ('male', 39.7952553633218, 42.7627317697228)
gold:  male 	 prediction:  ('male', 42.98353448673588, 45.88526055437099)
gold:  male 	 prediction:  ('female', 48.80951257208767, 44.17911343283583)
gold:  male 	 prediction: 

KeyboardInterrupt: 