# Cleaning data from the Enron Dataset for Gender Analysis  

Original Files can be found at https://www.kaggle.com/amank56/enron-clean-dataset

In [3]:
import pandas as pd
!pip install gender-guesser
import gender_guesser.detector as gender#!pip install nltk
import nltk
nltk.download('punkt')
  

Collecting gender-guesser
  Downloading gender_guesser-0.4.0-py2.py3-none-any.whl (379 kB)
Installing collected packages: gender-guesser
Successfully installed gender-guesser-0.4.0


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\baile\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [5]:
email_df_0 = pd.read_csv('emaildata0.csv')
email_df_1 = pd.read_csv('emaildata1.csv')
email_df_2 = pd.read_csv('emaildata2.csv')
email_df_3 = pd.read_csv('emaildata3.csv')
email_df_4 = pd.read_csv('emaildata4.csv')
email_df_5 = pd.read_csv('emaildata5.csv')


In [6]:
#combine all of the data into one dataframe 
emails_df = email_df_0.append(email_df_1).append(email_df_2).append(email_df_3).append(email_df_4).append(email_df_5)
emails_df.drop(columns = 'Unnamed: 0', inplace = True)
emails_df.reset_index(inplace = True)

In [7]:
emails_df.head()

Unnamed: 0,index,date,sender,recipient1,subject,text
0,0,2001-05-14 16:39:00-07:00,phillip.allen@enron.com,tim.belden@enron.com,,"['', 'Here is our forecast', '', ' ']"
1,1,2001-05-04 13:51:00-07:00,phillip.allen@enron.com,john.lavorato@enron.com,Re:,"['', 'Traveling to have a business meeting tak..."
2,2,2000-10-18 03:00:00-07:00,phillip.allen@enron.com,leah.arsdall@enron.com,Re: test,"['', 'test successful. way to go!!!']"
3,3,2000-10-23 06:13:00-07:00,phillip.allen@enron.com,randall.gay@enron.com,,"['', 'Randy,', '', ' Can you send me a schedul..."
4,4,2000-08-31 05:07:00-07:00,phillip.allen@enron.com,greg.piper@enron.com,Re: Hello,"['', ""Let's shoot for Tuesday at 11:45. ""]"


Drop any emails that were not sent to people with @enron.com - we only want to look at emails within the company

In [8]:
sent_within_company = [(emails_df.loc[row, 'recipient1'][-9:] == 'enron.com') & ((emails_df.loc[row, 'sender'][-9:] == 'enron.com')) for row in range(len(emails_df))]

it looks like only about 68% of the emails were sent within the company - we are only going to use those 68% for now

In [9]:
sum(sent_within_company) / len(emails_df)

0.6829996452922398

In [13]:
emails_within_company = emails_df[sent_within_company]

In [14]:
emails_within_company.loc[:, 'sender_name'] = [email.split('.')[0] for email in emails_within_company.loc[:, 'sender']]
emails_within_company.loc[:, 'recipient_name'] = [email.split('.')[0] for email in emails_within_company.loc[:, 'recipient1']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


Drop any names that have an at symbol in them - we assume that these are with IT desks, help desks, or other services such as this, and we won't be able to pull a gender from that anyway

In [15]:
intra_employee_emails = emails_within_company[[('@' not in email) for email in emails_within_company['sender_name']]]
intra_employee_emails = intra_employee_emails[[('@' not in email) for email in intra_employee_emails['recipient_name']]]

intra_employee_emails.head()

Unnamed: 0,index,date,sender,recipient1,subject,text,sender_name,recipient_name
0,0,2001-05-14 16:39:00-07:00,phillip.allen@enron.com,tim.belden@enron.com,,"['', 'Here is our forecast', '', ' ']",phillip,tim
1,1,2001-05-04 13:51:00-07:00,phillip.allen@enron.com,john.lavorato@enron.com,Re:,"['', 'Traveling to have a business meeting tak...",phillip,john
2,2,2000-10-18 03:00:00-07:00,phillip.allen@enron.com,leah.arsdall@enron.com,Re: test,"['', 'test successful. way to go!!!']",phillip,leah
3,3,2000-10-23 06:13:00-07:00,phillip.allen@enron.com,randall.gay@enron.com,,"['', 'Randy,', '', ' Can you send me a schedul...",phillip,randall
4,4,2000-08-31 05:07:00-07:00,phillip.allen@enron.com,greg.piper@enron.com,Re: Hello,"['', ""Let's shoot for Tuesday at 11:45. ""]",phillip,greg


## Parse the sender and recipient emails to pull off just their first names

First, We will drop any of the rows that have a sender or recipient name of a single letter - our gender guesser won't be able to figure out the gender accurately for these

In [16]:
intra_employee_emails = intra_employee_emails[[(len(name) >= 2) for name in intra_employee_emails['sender_name']]]
intra_employee_emails = intra_employee_emails[[(len(name) >= 2) for name in intra_employee_emails['recipient_name']]]

Next, we will make the first letter of each name uppercase and the rest lowercase-  that is what the gender guesser function requires

In [17]:
intra_employee_emails['sender_name'] = [(name[0].upper() + name[1:].lower()) for name in intra_employee_emails['sender_name']]
intra_employee_emails['recipient_name'] = [(name[0].upper() + name[1:].lower()) for name in intra_employee_emails['recipient_name']]

In [18]:
intra_employee_emails.head()

Unnamed: 0,index,date,sender,recipient1,subject,text,sender_name,recipient_name
0,0,2001-05-14 16:39:00-07:00,phillip.allen@enron.com,tim.belden@enron.com,,"['', 'Here is our forecast', '', ' ']",Phillip,Tim
1,1,2001-05-04 13:51:00-07:00,phillip.allen@enron.com,john.lavorato@enron.com,Re:,"['', 'Traveling to have a business meeting tak...",Phillip,John
2,2,2000-10-18 03:00:00-07:00,phillip.allen@enron.com,leah.arsdall@enron.com,Re: test,"['', 'test successful. way to go!!!']",Phillip,Leah
3,3,2000-10-23 06:13:00-07:00,phillip.allen@enron.com,randall.gay@enron.com,,"['', 'Randy,', '', ' Can you send me a schedul...",Phillip,Randall
4,4,2000-08-31 05:07:00-07:00,phillip.allen@enron.com,greg.piper@enron.com,Re: Hello,"['', ""Let's shoot for Tuesday at 11:45. ""]",Phillip,Greg


## Next, we will add a label with our best guess of what the sender's and recipient's genders were 

Gender Guesser Documentation 


https://pypi.org/project/gender-guesser/

In [21]:
d = gender.Detector()

intra_employee_emails.loc[:, 'sender_gender'] = [d.get_gender(name, 'usa') for name in intra_employee_emails['sender_name']]
intra_employee_emails.loc[:, 'recipient_gender'] = [d.get_gender(name, 'usa') for name in intra_employee_emails['recipient_name']]

#drop the ones that have an unknown gender - they are names like "Public Relations", "Energy", etc
intra_employee_emails = intra_employee_emails[(intra_employee_emails['sender_gender'] != 'unknown') & (intra_employee_emails['recipient_gender'] != 'unknown')]

In [22]:
intra_employee_emails.head()

Unnamed: 0,index,date,sender,recipient1,subject,text,sender_name,recipient_name,sender_gender,recipient_gender
0,0,2001-05-14 16:39:00-07:00,phillip.allen@enron.com,tim.belden@enron.com,,"['', 'Here is our forecast', '', ' ']",Phillip,Tim,male,male
1,1,2001-05-04 13:51:00-07:00,phillip.allen@enron.com,john.lavorato@enron.com,Re:,"['', 'Traveling to have a business meeting tak...",Phillip,John,male,male
2,2,2000-10-18 03:00:00-07:00,phillip.allen@enron.com,leah.arsdall@enron.com,Re: test,"['', 'test successful. way to go!!!']",Phillip,Leah,male,female
3,3,2000-10-23 06:13:00-07:00,phillip.allen@enron.com,randall.gay@enron.com,,"['', 'Randy,', '', ' Can you send me a schedul...",Phillip,Randall,male,male
4,4,2000-08-31 05:07:00-07:00,phillip.allen@enron.com,greg.piper@enron.com,Re: Hello,"['', ""Let's shoot for Tuesday at 11:45. ""]",Phillip,Greg,male,male


In [25]:
accepted_chars = ['!', ' ', '.', '?']
#remove any characters besides certain punctuation, letters, and numbers
# we are doing this because the kaggle link where we downloaded the data from was parsed, but Python read it as a string - we will get rid of the extra commas and brackets
def remove_unwanted_chars(text):
    return (''.join(ch for ch in text if (ch.isalnum() or ch in accepted_chars))).strip(' ')


intra_employee_emails['clean_text'] = intra_employee_emails['text'].apply(remove_unwanted_chars)

In [None]:
#output the clean data to a csv file
intra_employee_emails.to_csv('/content/drive/Shared drives/TAMU-Datathon-2020/clean_enron_emails.csv')

In [None]:
#project analysis can be found in enron_gender_analysis.ipynb 