In [3]:
# Goal: combine the 3 raw Academy files into a single file.
import boto3
import pandas as pd
# Where the raw Academy files live in S3.
bucket_name = 'data-eng-210-final-project'
folder_name = 'Academy'

# Low-level S3 client used by the cells below to list and download objects.
s3 = boto3.client(service_name='s3')

In [5]:
# A single list_objects_v2 call returns at most 1,000 keys, so walk every
# page of results instead of silently dropping files past the first page.
# The merged listing keeps the response's 'Contents' shape (a list of
# per-object dicts) and is simply empty when nothing matches the prefix.
paginator = s3.get_paginator('list_objects_v2')
objects = {
    'Contents': [
        entry
        for page in paginator.paginate(Bucket=bucket_name, Prefix=folder_name)
        for entry in page.get('Contents', [])
    ]
}

In [6]:
# Download every CSV found under the prefix and parse each into a DataFrame.
dfs = []
# S3 omits the 'Contents' key when no object matches the prefix, so fall back
# to an empty list rather than failing with a KeyError.
for entry in objects.get('Contents', []):
    key = entry['Key']
    if not key.endswith('.csv'):
        # Skip anything that is not a CSV file.
        continue
    # Keep the download response under its own name instead of rebinding the
    # loop variable.
    response = s3.get_object(Bucket=bucket_name, Key=key)
    # pandas reads directly from the streaming response body.
    df = pd.read_csv(response['Body'])
    dfs.append(df)

In [7]:
# Stack the per-file frames row-wise; ignore_index renumbers the rows so the
# per-file indexes do not repeat in the result.
combined_df = pd.concat(objs=dfs, axis=0, ignore_index=True)

In [8]:
# Show the combined frame; as the cell's last expression it is rendered by the notebook.
combined_df

Unnamed: 0,name,trainer,Analytic_W1,Independent_W1,Determined_W1,Professional_W1,Studious_W1,Imaginative_W1,Analytic_W2,Independent_W2,...,Determined_W9,Professional_W9,Studious_W9,Imaginative_W9,Analytic_W10,Independent_W10,Determined_W10,Professional_W10,Studious_W10,Imaginative_W10
0,Quintus Penella,Gregor Gomez,1,2,2,1,2,2,,,...,,,,,,,,,,
1,Simon Murrey,Gregor Gomez,6,1,1,2,4,2,3.0,1.0,...,,,,,,,,,,
2,Gustaf Lude,Gregor Gomez,6,4,1,1,2,3,1.0,1.0,...,,,,,,,,,,
3,Yolanda Fosse,Gregor Gomez,2,1,2,3,3,3,4.0,2.0,...,,,,,,,,,,
4,Lynnett Swin,Gregor Gomez,2,2,4,5,1,2,3.0,2.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
392,Moritz Mosedall,Mohammad Velazquez,1,1,5,1,2,6,5.0,3.0,...,,,,,,,,,,
393,Chaim Inseal,Mohammad Velazquez,1,3,3,4,1,2,3.0,3.0,...,,,,,,,,,,
394,Gertruda Syddie,Mohammad Velazquez,3,1,2,8,1,4,2.0,4.0,...,,,,,,,,,,
395,Thom Derwin,Mohammad Velazquez,3,7,3,3,3,1,2.0,7.0,...,,,,,,,,,,


In [10]:

# Preview the first five rows; leaving the frame as the cell's last expression
# lets the notebook render it as a table instead of wrapped plain text.
combined_df.head()

              name       trainer  Analytic_W1  Independent_W1  Determined_W1   
0  Quintus Penella  Gregor Gomez            1               2              2  \
1     Simon Murrey  Gregor Gomez            6               1              1   
2      Gustaf Lude  Gregor Gomez            6               4              1   
3    Yolanda Fosse  Gregor Gomez            2               1              2   
4     Lynnett Swin  Gregor Gomez            2               2              4   

   Professional_W1  Studious_W1  Imaginative_W1  Analytic_W2  Independent_W2   
0                1            2               2          NaN             NaN  \
1                2            4               2          3.0             1.0   
2                1            2               3          1.0             1.0   
3                3            3               3          4.0             2.0   
4                5            1               2          3.0             2.0   

   ...  Determined_W9  Professional_W9

In [11]:
# Print each column's name, non-null count and dtype to spot missing data.
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397 entries, 0 to 396
Data columns (total 62 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              397 non-null    object 
 1   trainer           397 non-null    object 
 2   Analytic_W1       397 non-null    int64  
 3   Independent_W1    397 non-null    int64  
 4   Determined_W1     397 non-null    int64  
 5   Professional_W1   397 non-null    int64  
 6   Studious_W1       397 non-null    int64  
 7   Imaginative_W1    397 non-null    int64  
 8   Analytic_W2       387 non-null    float64
 9   Independent_W2    387 non-null    float64
 10  Determined_W2     387 non-null    float64
 11  Professional_W2   387 non-null    float64
 12  Studious_W2       387 non-null    float64
 13  Imaginative_W2    387 non-null    float64
 14  Analytic_W3       369 non-null    float64
 15  Independent_W3    369 non-null    float64
 16  Determined_W3     369 non-null    float64
 1

In [12]:
# Missing-value report: one row per column of combined_df.

# How many entries are null in each column.
null_values_count = combined_df.isna().sum()

# The same counts expressed as a percentage of the total number of rows.
null_values_percentage = null_values_count.div(len(combined_df)).mul(100)

# Place the two Series side by side, labelled by the report's column headers.
null_values_summary = pd.concat(
    [null_values_count, null_values_percentage],
    axis=1,
    keys=['Null Values Count', 'Null Values Percentage'],
)

# Show the report.
print(null_values_summary)

                  Null Values Count  Null Values Percentage
name                              0                0.000000
trainer                           0                0.000000
Analytic_W1                       0                0.000000
Independent_W1                    0                0.000000
Determined_W1                     0                0.000000
...                             ...                     ...
Independent_W10                 235               59.193955
Determined_W10                  235               59.193955
Professional_W10                235               59.193955
Studious_W10                    235               59.193955
Imaginative_W10                 235               59.193955

[62 rows x 2 columns]


In [13]:
# Check whether any name appears more than once.
# Compare case-insensitively: the names are lower-cased later in this notebook,
# so "Jane Doe" and "jane doe" would end up identical there even though a
# case-sensitive check here would not report them.
normalized_names = combined_df['name'].str.lower()
duplicate_rows = combined_df[normalized_names.duplicated()]
duplicate_names = duplicate_rows['name'].unique()

print("Duplicate names:", list(duplicate_names))

Duplicate names: []


In [16]:
# Convert the name and trainer values to lower case.
for text_col in ('name', 'trainer'):
    combined_df[text_col] = combined_df[text_col].str.lower()


In [19]:
# Convert every column label to lower case.
combined_df.columns = list(map(str.lower, combined_df.columns))


In [21]:
# Remove the trailing ".0" from the weekly score columns.
# The scores were read as floats only because some weeks contain missing
# values. Casting to pandas' nullable integer dtype shows them as whole
# numbers and keeps missing entries as <NA>; a round-trip through str would
# instead turn every missing value into the literal text 'nan' and leave the
# scores as strings.
score_prefixes = ('analytic_', 'independent_', 'determined_', 'professional_', 'studious_', 'imaginative_')
cols_to_convert = [col for col in combined_df.columns if col.startswith(score_prefixes)]

# astype('Int64') raises if a score has a fractional part, so unexpected
# non-integer data is surfaced rather than silently truncated.
combined_df[cols_to_convert] = combined_df[cols_to_convert].astype('Int64')


In [22]:
# Show the frame again to check the lower-cased text and the score columns.
combined_df

Unnamed: 0,name,trainer,analytic_w1,independent_w1,determined_w1,professional_w1,studious_w1,imaginative_w1,analytic_w2,independent_w2,...,determined_w9,professional_w9,studious_w9,imaginative_w9,analytic_w10,independent_w10,determined_w10,professional_w10,studious_w10,imaginative_w10
0,quintus penella,gregor gomez,1,2,2,1,2,2,,,...,,,,,,,,,,
1,simon murrey,gregor gomez,6,1,1,2,4,2,3,1,...,,,,,,,,,,
2,gustaf lude,gregor gomez,6,4,1,1,2,3,1,1,...,,,,,,,,,,
3,yolanda fosse,gregor gomez,2,1,2,3,3,3,4,2,...,,,,,,,,,,
4,lynnett swin,gregor gomez,2,2,4,5,1,2,3,2,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
392,moritz mosedall,mohammad velazquez,1,1,5,1,2,6,5,3,...,,,,,,,,,,
393,chaim inseal,mohammad velazquez,1,3,3,4,1,2,3,3,...,,,,,,,,,,
394,gertruda syddie,mohammad velazquez,3,1,2,8,1,4,2,4,...,,,,,,,,,,
395,thom derwin,mohammad velazquez,3,7,3,3,3,1,2,7,...,,,,,,,,,,
