In [53]:
import ast
import json
from io import StringIO

import boto3
import pandas as pd

In [54]:
# Read data: open S3 handles and download the source CSV as text
s3_client = boto3.client('s3')
s3_resource = boto3.resource('s3')

# Talent: combined data, fetched as a single object and decoded from UTF-8
talent_bucket = 'data-504-final-project'
talent_key = 'Talent_Combined/combined_talent_decision_scores.csv'
response = s3_client.get_object(Bucket=talent_bucket, Key=talent_key)
raw_bytes = response['Body'].read()
content = raw_bytes.decode('utf-8')

In [55]:
# Parse the downloaded CSV text into a DataFrame and preview the first rows
csv_buffer = StringIO(content)
df = pd.read_csv(csv_buffer)
df.head(3)

Unnamed: 0,name,date,strengths,weaknesses,self_development,geo_flex,financial_support_self,result,course_interest,tech_self_score.C#,tech_self_score.Java,tech_self_score.R,tech_self_score.JavaScript,tech_self_score.Python,tech_self_score.C++,tech_self_score.Ruby,tech_self_score.SPSS,tech_self_score.PHP
0,Stillmann Castano,22/08/2019,['Charisma'],"['Distracted', 'Impulsive', 'Introverted']",Yes,Yes,Yes,Pass,Business,6.0,5.0,2.0,2.0,,,,,
1,Hilary Willmore,01/08/2019,"['Patient', 'Curious', 'Problem Solving']","['Overbearing', 'Chatty', 'Indifferent']",No,Yes,Yes,Fail,Data,4.0,2.0,,,1.0,4.0,,,
2,Efrem Whipple,22/08/2019,"['Courteous', 'Independent', 'Patient']","['Introverted', 'Impulsive', 'Anxious']",Yes,Yes,Yes,Pass,Business,,,,,,4.0,4.0,,


In [56]:
# Row count on its own line, followed by the full (rows, columns) shape
row_count = len(df)
print("Number of rows:", row_count, sep="\n")
print(df.shape)

Number of rows:
1000
(1000, 18)


### Missing values

In [77]:
# Count missing values per column.
# NOTE(review): the saved output of this cell carries execution count 77 and
# shows the renamed labels (e.g. tech_self_score_CSharp), so it was produced
# after the rename cell ran; on a top-to-bottom run the labels printed here
# would still be the original tech_self_score.* names.
null_counts = df.isna().sum()
print(null_counts)

name                            0
date                            0
strengths                       0
weaknesses                      0
self_development                0
geo_flex                        0
financial_support_self          0
result                          0
course_interest                 0
tech_self_score_CSharp        618
tech_self_score_Java          609
tech_self_score_R             651
tech_self_score_JavaScript    656
tech_self_score_Python        599
tech_self_score_C++           642
tech_self_score_Ruby          633
tech_self_score_SPSS          665
tech_self_score_PHP           674
dtype: int64


### Duplicates

In [58]:
# Boolean mask marking each row that exactly repeats an earlier row
duplicates = df.duplicated()
duplicate_count = duplicates.sum()
print(f"Number of duplicates: {duplicate_count}")

Number of duplicates: 11


In [59]:
# Show the rows flagged by the duplicate mask
df.loc[duplicates]

Unnamed: 0,name,date,strengths,weaknesses,self_development,geo_flex,financial_support_self,result,course_interest,tech_self_score.C#,tech_self_score.Java,tech_self_score.R,tech_self_score.JavaScript,tech_self_score.Python,tech_self_score.C++,tech_self_score.Ruby,tech_self_score.SPSS,tech_self_score.PHP
59,Pamelina Itzkovsky,13/08/2019,"['Curious', 'Perfectionism', 'Determined']","['Impatient', 'Passive', 'Intolerant']",Yes,Yes,Yes,Pass,Engineering,,2.0,,,,,5.0,,
154,Rafi Andrzejczak,07/08/2019,"['Critical Thinking', 'Innovative']","['Chatty', 'Undisciplined', 'Selfish']",Yes,No,Yes,Fail,Business,,2.0,,,6.0,,,,
211,Redford Bubbings,15/08/2019,"['Rational', 'Curious', 'Courteous']",['Procrastination'],No,Yes,Yes,Fail,Engineering,,4.0,2.0,,,,1.0,,
354,Fee Karpe,17/07/2019,"['Creative', 'Efficient']","['Procrastination', 'Perfectionist']",Yes,Yes,Yes,Pass,Data,,2.0,,,7.0,,2.0,,
466,Mirilla Bottinelli,16/07/2019,['Altruism'],['Chatty'],Yes,Yes,Yes,Pass,Data,1.0,,,,,2.0,4.0,3.0,
700,Fidel Norval,19/07/2019,"['Innovative', 'Determined']","['Selfish', 'Controlling', 'Introverted']",No,Yes,Yes,Fail,Engineering,,,,,,,,4.0,
776,Gavan Soltan,05/07/2019,"['Efficient', 'Empathy', 'Curious']",['Controlling'],Yes,Yes,No,Fail,Engineering,,,4.0,,3.0,,,1.0,
831,Darn Kirkby,18/07/2019,"['Competitive', 'Patient', 'Problem Solving']",['Stubborn'],Yes,Yes,Yes,Pass,Engineering,3.0,,,,,,4.0,4.0,
852,Corby Torel,20/07/2019,['Altruism'],"['Indecisive', 'Slow']",Yes,Yes,No,Fail,Business,,,,6.0,,3.0,,,
888,Kendell Ruppele,06/07/2019,"['Listening', 'Innovative', 'Composure']","['Chaotic', 'Introverted']",Yes,No,Yes,Fail,Engineering,,,3.0,6.0,,,,,


In [60]:
# Keep only the first occurrence of each fully identical row, then preview
df = df.drop_duplicates(keep='first')
df.head(3)

Unnamed: 0,name,date,strengths,weaknesses,self_development,geo_flex,financial_support_self,result,course_interest,tech_self_score.C#,tech_self_score.Java,tech_self_score.R,tech_self_score.JavaScript,tech_self_score.Python,tech_self_score.C++,tech_self_score.Ruby,tech_self_score.SPSS,tech_self_score.PHP
0,Stillmann Castano,22/08/2019,['Charisma'],"['Distracted', 'Impulsive', 'Introverted']",Yes,Yes,Yes,Pass,Business,6.0,5.0,2.0,2.0,,,,,
1,Hilary Willmore,01/08/2019,"['Patient', 'Curious', 'Problem Solving']","['Overbearing', 'Chatty', 'Indifferent']",No,Yes,Yes,Fail,Data,4.0,2.0,,,1.0,4.0,,,
2,Efrem Whipple,22/08/2019,"['Courteous', 'Independent', 'Patient']","['Introverted', 'Impulsive', 'Anxious']",Yes,Yes,Yes,Pass,Business,,,,,,4.0,4.0,,


### Data types

In [61]:
# Column dtypes as loaded, before any conversion
column_dtypes = df.dtypes
print(column_dtypes)

name                           object
date                           object
strengths                      object
weaknesses                     object
self_development               object
geo_flex                       object
financial_support_self         object
result                         object
course_interest                object
tech_self_score.C#            float64
tech_self_score.Java          float64
tech_self_score.R             float64
tech_self_score.JavaScript    float64
tech_self_score.Python        float64
tech_self_score.C++           float64
tech_self_score.Ruby          float64
tech_self_score.SPSS          float64
tech_self_score.PHP           float64
dtype: object


#### Dates

In [62]:
# Date
# Trial-parse the day/month/year strings; anything that does not match the
# format is coerced to NaT instead of raising
date_format = '%d/%m/%Y'
temp = pd.to_datetime(df['date'], errors='coerce', format=date_format)

In [63]:
# Rows whose trial parse produced NaT are the ones with a malformed date
unparsed_mask = temp.isna()
invalid_rows = df.loc[unparsed_mask]
print(invalid_rows.loc[:, ['name', 'date']])

                      name         date
220       Balduin Culleton  13//08/2019
279        Saundra Fuzzens  28//08/2019
408           Hewet Derwin  11//07/2019
588         Joseito Duiged  17//07/2019
601        Arlette Pickles  11//07/2019
723           Jecho Rousel  05//07/2019
760      Homere Stentiford  25//07/2019
786      Hilliard Brockett  05//07/2019
808       Evangelin Gorges  11//07/2019
822  Gerianne Christoforou  13//07/2019
849           Byrom Sawday  11//07/2019
923          Tobe Markovic  05//12/2019
938          Nerti Welfair  11//12/2019


In [64]:
# Fix: replace the doubled slash seen in the malformed dates with a single one
# (regex=False so '//' is matched as literal text)
cleaned_dates = df['date'].str.replace('//', '/', regex=False)
df['date'] = cleaned_dates

In [65]:
# Change to datetime: parse the cleaned strings once, keep the parsed Series
# in temp for the follow-up NaT check, and store it as the date column
temp = pd.to_datetime(df['date'], format='%d/%m/%Y', errors='coerce')
df['date'] = temp

In [66]:
# Check if fixed: no row should still have NaT after the conversion
still_unparsed = temp.isna()
invalid_rows = df.loc[still_unparsed]
print(invalid_rows.loc[:, ['name', 'date']])

Empty DataFrame
Columns: [name, date]
Index: []


#### Booleans

In [67]:
# Translate the Yes/No answer columns into True/False values
bool_cols = ['self_development', 'geo_flex', 'financial_support_self']
yes_no_to_bool = {'Yes': True, 'No': False}
df = df.assign(**{name: df[name].map(yes_no_to_bool) for name in bool_cols})

#### Object to Category

In [68]:
# Store the result column with pandas' categorical dtype
df['result'] = df['result'].astype(pd.CategoricalDtype())

In [69]:
# Store the course_interest column with pandas' categorical dtype
df['course_interest'] = df['course_interest'].astype(pd.CategoricalDtype())

#### Strengths and weaknesses -> comma-separated strings

In [70]:
# Strengths and weaknesses: list representations -> comma-separated strings
def _list_cell_to_csv(cell):
    """Return the items of one strengths/weaknesses cell joined with ', '.

    After ``pd.read_csv`` each cell holds the *text* of a Python list (for
    example "['Patient', 'Curious']"), so it is parsed into a real list
    before joining; joining the raw string would insert ', ' between every
    character. A value that is already a list is joined directly. A string
    that does not start with '[' is treated as already converted and is
    returned unchanged, which keeps this cell safe to re-run.
    """
    if isinstance(cell, str):
        text = cell.strip()
        if not text.startswith('['):
            return cell
        # literal_eval only evaluates Python literals, never arbitrary code
        cell = ast.literal_eval(text)
    return ', '.join(cell)


for list_col in ['strengths', 'weaknesses']:
    df[list_col] = df[list_col].apply(_list_cell_to_csv)

#### Rename columns

In [71]:
# Rename tech scores columns: '#' -> 'Sharp' and '.' -> '_'.
# regex=False makes both patterns literal text on every pandas version;
# interpreted as a regular expression, '.' would match any character.
# NOTE(review): 'tech_self_score_C++' keeps its '+' characters — confirm that
# is acceptable for downstream consumers of these column names.
df.columns = (
    df.columns
    .str.replace('#', 'Sharp', regex=False)
    .str.replace('.', '_', regex=False)
)

### Check data types after conversion

In [72]:
# Column dtypes after the conversions above
converted_dtypes = df.dtypes
print(converted_dtypes)

name                                  object
date                          datetime64[ns]
strengths                             object
weaknesses                            object
self_development                        bool
geo_flex                                bool
financial_support_self                  bool
result                              category
course_interest                     category
tech_self_score_CSharp               float64
tech_self_score_Java                 float64
tech_self_score_R                    float64
tech_self_score_JavaScript           float64
tech_self_score_Python               float64
tech_self_score_C++                  float64
tech_self_score_Ruby                 float64
tech_self_score_SPSS                 float64
tech_self_score_PHP                  float64
dtype: object


#### Cleaned version

In [75]:
# Preview the first three rows of the cleaned frame
df.iloc[:3]

Unnamed: 0,name,date,strengths,weaknesses,self_development,geo_flex,financial_support_self,result,course_interest,tech_self_score_CSharp,tech_self_score_Java,tech_self_score_R,tech_self_score_JavaScript,tech_self_score_Python,tech_self_score_C++,tech_self_score_Ruby,tech_self_score_SPSS,tech_self_score_PHP
0,Stillmann Castano,2019-08-22,Charisma,"Distracted, Impulsive, Introverted",True,True,True,Pass,Business,6.0,5.0,2.0,2.0,,,,,
1,Hilary Willmore,2019-08-01,"Patient, Curious, Problem Solving","Overbearing, Chatty, Indifferent",False,True,True,Fail,Data,4.0,2.0,,,1.0,4.0,,,
2,Efrem Whipple,2019-08-22,"Courteous, Independent, Patient","Introverted, Impulsive, Anxious",True,True,True,Pass,Business,,,,,,4.0,4.0,,


### Export back to CSV

In [78]:
# Write the cleaned frame to a local CSV file, leaving out the row index
output_path = 'cleaned_talent_decision_scores.csv'
df.to_csv(output_path, index=False)