In [45]:
import pandas as pd
import numpy as np
import string
import datetime

In [46]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [47]:
df = pd.read_parquet('./Medallion Architecture/bronze/bronze_transactions.parquet')


In [49]:
def drop_null_columns(df, threshold=0.2):
    null_percentage = df.isnull().mean()
    columns_to_keep = null_percentage[null_percentage < threshold].index

    if len(columns_to_keep) == len(df.columns):
        print('No columns removed')
    else:
        print(f'removing: {[c for c in df.columns if c not in columns_to_keep]}')
    return df[columns_to_keep]



In [68]:
def fill_missing_values(df, columns_defaults: dict):
    for column, default_value in columns_defaults.items():
        df[column] = df[column].fillna(default_value)
    return df


In [69]:
def convert_column_types(df, columns_types: dict):
    try:
        for column, dtype in columns_types.items():
            df[column] = df[column].astype(dtype)
        return df
    except Exception as e:
        print(f'{column} caused an issue')
        raise e

In [70]:
def remove_punctuation(df, columns: list):
    for c in columns:
        df.loc[:,c] = df[c].str.replace(r'[^\w\s]|_', '', regex=True)
        return df

In [71]:
def check_formats(df, expected_formats: dict):
    incorrect_formats = []
    for column, datatype in df.dtypes.to_dict().items():
        expected_type = expected_formats.get(column)
        if expected_type != datatype:
            incorrect_formats.append((column, datatype, expected_type))

    incorrect_columns = [c[0] for c in incorrect_formats]
    correct_format_count = len([c for c in df.columns if c not in incorrect_columns])
    if incorrect_formats:
        print('Below are incorrect formats')
        print('-' * 50)
        print(f'Correct Column Count {correct_format_count}')
        return pd.DataFrame(incorrect_formats, columns=['Column', 'Actual', 'Expected'])
    else:
        print('Validation Complete, no discrepencies')
        

In [73]:
def check_similarity(word1: str, word2: str) -> float:
    word_set1 = set(word1)
    word_set2 = set(word2)

    intersection = word_set1.intersection(word_set2)
    intersection_count = len(intersection)

    total_char_count = len(word_set1.union(word_set2))

    similarity = intersection_count / total_char_count
    return similarity


In [74]:
def check_mispelling(dataframe: pd.DataFrame, column: str, similarity_threshold: float) -> pd.DataFrame:
    all_unique_values = list(set(dataframe[column].tolist()))

    similarity_list = []
    for n in range(len(all_unique_values)):
        value1 = all_unique_values[n]
        for n2 in range(n + 1, len(all_unique_values)):
            value2 = all_unique_values[n2]
            similarity = round(check_similarity(value1, value2), 4)
            if similarity >= similarity_threshold:
                similarity_list.append([value1, value2, similarity])
    return pd.DataFrame(similarity_list, columns=['name1', 'name2', 'similarity'])


In [75]:
df.head()

Unnamed: 0,Student Name,House,Year,Wand Type,Pet,Potions Grade,Defense Against the Dark Arts Grade,Transfiguration Grade,Spells Learned,Quidditch Position,Points Earned for House,Detentions,Participation in Dueling Club,Triwizard Tournament Involvement,Items Owned,Knuts Spent in Hogsmeade,Attendance at Classes,Magical Accidents,fileName,loadDatetimeStamp
0,Seamus Potter,Slytherin,2,"9 inches, elm, veela hair core",Dragon,Outstanding,Exceeds Expectations,Outstanding,26,Beater,-100,3,No,No,"Time-Turner, Marauder's Map",88,74,5,Hogwarts_Student_Data_20240101.csv,2024-12-13 10:31:43.870128
1,Ginny Spinnet,Gryffindor,6,"9 inches, elm, veela hair core",Cat,Exceeds Expectations,Acceptable,Outstanding,11,Chaser,139,2,No,No,"Firebolt, Sneakoscope, Extendable Ears",99,67,6,Hogwarts_Student_Data_20240101.csv,2024-12-13 10:31:43.870128
2,Padma Parkinson,Gryffindor,4,"11 inches, holly, phoenix feather core",Muggle Born,Outstanding,Acceptable,Outstanding,19,Seeker,93,2,No,No,Invisibility Cloak,342,88,6,Hogwarts_Student_Data_20240101.csv,2024-12-13 10:31:43.870128
3,,Ravenclaw,1,"12 inches, yew, unicorn hair core",Toad,Exceeds Expectations,Exceeds Expectations,Exceeds Expectations,16,Seeker,-73,2,No,,Time-Turner,233,45,5,Hogwarts_Student_Data_20240101.csv,2024-12-13 10:31:43.870128
4,,Ravenclaw,1,"10 inches, oak, dragon heartstring core",Owl,,Outstanding,Exceeds Expectations,9,Keeper,90,0,No,Yes,"Marauder's Map, Sneakoscope",473,27,7,Hogwarts_Student_Data_20240101.csv,2024-12-13 10:31:43.870128


In [76]:
df.isnull().sum()

Student Name                           13440
House                                   4000
Year                                       0
Wand Type                                  0
Pet                                     7946
Potions Grade                           3440
Defense Against the Dark Arts Grade        0
Transfiguration Grade                      0
Spells Learned                             0
Quidditch Position                      9504
Points Earned for House                    0
Detentions                                 0
Participation in Dueling Club              0
Triwizard Tournament Involvement        5920
Items Owned                                0
Knuts Spent in Hogsmeade                   0
Attendance at Classes                      0
Magical Accidents                          0
fileName                                   0
loadDatetimeStamp                          0
dtype: int64

In [77]:
df.isnull().mean() * 100

Student Name                           28.000000
House                                   8.333333
Year                                    0.000000
Wand Type                               0.000000
Pet                                    16.554167
Potions Grade                           7.166667
Defense Against the Dark Arts Grade     0.000000
Transfiguration Grade                   0.000000
Spells Learned                          0.000000
Quidditch Position                     19.800000
Points Earned for House                 0.000000
Detentions                              0.000000
Participation in Dueling Club           0.000000
Triwizard Tournament Involvement       12.333333
Items Owned                             0.000000
Knuts Spent in Hogsmeade                0.000000
Attendance at Classes                   0.000000
Magical Accidents                       0.000000
fileName                                0.000000
loadDatetimeStamp                       0.000000
dtype: float64

In [78]:
drop_null_columns(df, threshold=.3)

No columns removed


Unnamed: 0,Student Name,House,Year,Wand Type,Pet,Potions Grade,Defense Against the Dark Arts Grade,Transfiguration Grade,Spells Learned,Quidditch Position,Points Earned for House,Detentions,Participation in Dueling Club,Triwizard Tournament Involvement,Items Owned,Knuts Spent in Hogsmeade,Attendance at Classes,Magical Accidents,fileName,loadDatetimeStamp
0,Seamus Potter,Slytherin,2,"9 inches, elm, veela hair core",Dragon,Outstanding,Exceeds Expectations,Outstanding,26,Beater,-100,3,No,No,"Time-Turner, Marauder's Map",88,74,5,Hogwarts_Student_Data_20240101.csv,2024-12-13 10:31:43.870128
1,Ginny Spinnet,Gryffindor,6,"9 inches, elm, veela hair core",Cat,Exceeds Expectations,Acceptable,Outstanding,11,Chaser,139,2,No,No,"Firebolt, Sneakoscope, Extendable Ears",99,67,6,Hogwarts_Student_Data_20240101.csv,2024-12-13 10:31:43.870128
2,Padma Parkinson,Gryffindor,4,"11 inches, holly, phoenix feather core",Muggle Born,Outstanding,Acceptable,Outstanding,19,Seeker,93,2,No,No,Invisibility Cloak,342,88,6,Hogwarts_Student_Data_20240101.csv,2024-12-13 10:31:43.870128
3,,Ravenclaw,1,"12 inches, yew, unicorn hair core",Toad,Exceeds Expectations,Exceeds Expectations,Exceeds Expectations,16,Seeker,-73,2,No,,Time-Turner,233,45,5,Hogwarts_Student_Data_20240101.csv,2024-12-13 10:31:43.870128
4,,Ravenclaw,1,"10 inches, oak, dragon heartstring core",Owl,,Outstanding,Exceeds Expectations,9,Keeper,90,0,No,Yes,"Marauder's Map, Sneakoscope",473,27,7,Hogwarts_Student_Data_20240101.csv,2024-12-13 10:31:43.870128
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47995,Susan Potter,Slytherin,1,"11 inches, holly, phoenix feather core",,Outstanding,Outstanding,Exceeds Expectations,24,Seeker,90,5,Yes,Yes,"Marauder's Map, Firebolt",399,40,1,Hogwarts_Student_Data_20240601.csv,2024-12-13 10:31:44.013827
47996,Alicyia Boot,,4,"11 inches, holly, phoenix feather core",Toad,,Poor,Poor,28,Seeker,-9,1,Yes,Yes,"Extendable Ears, Marauder's Map, Invisibility ...",299,27,5,Hogwarts_Student_Data_20240601.csv,2024-12-13 10:31:44.013827
47997,Terry Granger,Ravenclaw,7,"9 inches, elm, veela hair core",Owl,Outstanding,Poor,Exceeds Expectations,12,Seeker,-47,4,Yes,No,"Sneakoscope, Firebolt",124,27,10,Hogwarts_Student_Data_20240601.csv,2024-12-13 10:31:44.013827
47998,Ron Bell,Hufflepuff,1,"9 inches, elm, veela hair core",Dragon,Exceeds Expectations,Acceptable,Acceptable,11,Chaser,75,5,Yes,No,"Firebolt, Sneakoscope",362,49,7,Hogwarts_Student_Data_20240601.csv,2024-12-13 10:31:44.013827


In [79]:
fill_missing_values(df, {'Student Name': 'Unknown Student', 'House':'Unknown House', 'Pet': 'Unknown Pet', 'Potions Grade': -1, 'Quidditch Position': 'Uknown Quidditch','Triwizard Tournament Involvement': 'Unknown Triwizard'})

Unnamed: 0,Student Name,House,Year,Wand Type,Pet,Potions Grade,Defense Against the Dark Arts Grade,Transfiguration Grade,Spells Learned,Quidditch Position,Points Earned for House,Detentions,Participation in Dueling Club,Triwizard Tournament Involvement,Items Owned,Knuts Spent in Hogsmeade,Attendance at Classes,Magical Accidents,fileName,loadDatetimeStamp
0,Seamus Potter,Slytherin,2,"9 inches, elm, veela hair core",Dragon,Outstanding,Exceeds Expectations,Outstanding,26,Beater,-100,3,No,No,"Time-Turner, Marauder's Map",88,74,5,Hogwarts_Student_Data_20240101.csv,2024-12-13 10:31:43.870128
1,Ginny Spinnet,Gryffindor,6,"9 inches, elm, veela hair core",Cat,Exceeds Expectations,Acceptable,Outstanding,11,Chaser,139,2,No,No,"Firebolt, Sneakoscope, Extendable Ears",99,67,6,Hogwarts_Student_Data_20240101.csv,2024-12-13 10:31:43.870128
2,Padma Parkinson,Gryffindor,4,"11 inches, holly, phoenix feather core",Muggle Born,Outstanding,Acceptable,Outstanding,19,Seeker,93,2,No,No,Invisibility Cloak,342,88,6,Hogwarts_Student_Data_20240101.csv,2024-12-13 10:31:43.870128
3,Unknown Student,Ravenclaw,1,"12 inches, yew, unicorn hair core",Toad,Exceeds Expectations,Exceeds Expectations,Exceeds Expectations,16,Seeker,-73,2,No,Unknown Triwizard,Time-Turner,233,45,5,Hogwarts_Student_Data_20240101.csv,2024-12-13 10:31:43.870128
4,Unknown Student,Ravenclaw,1,"10 inches, oak, dragon heartstring core",Owl,-1,Outstanding,Exceeds Expectations,9,Keeper,90,0,No,Yes,"Marauder's Map, Sneakoscope",473,27,7,Hogwarts_Student_Data_20240101.csv,2024-12-13 10:31:43.870128
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47995,Susan Potter,Slytherin,1,"11 inches, holly, phoenix feather core",Unknown Pet,Outstanding,Outstanding,Exceeds Expectations,24,Seeker,90,5,Yes,Yes,"Marauder's Map, Firebolt",399,40,1,Hogwarts_Student_Data_20240601.csv,2024-12-13 10:31:44.013827
47996,Alicyia Boot,Unknown House,4,"11 inches, holly, phoenix feather core",Toad,-1,Poor,Poor,28,Seeker,-9,1,Yes,Yes,"Extendable Ears, Marauder's Map, Invisibility ...",299,27,5,Hogwarts_Student_Data_20240601.csv,2024-12-13 10:31:44.013827
47997,Terry Granger,Ravenclaw,7,"9 inches, elm, veela hair core",Owl,Outstanding,Poor,Exceeds Expectations,12,Seeker,-47,4,Yes,No,"Sneakoscope, Firebolt",124,27,10,Hogwarts_Student_Data_20240601.csv,2024-12-13 10:31:44.013827
47998,Ron Bell,Hufflepuff,1,"9 inches, elm, veela hair core",Dragon,Exceeds Expectations,Acceptable,Acceptable,11,Chaser,75,5,Yes,No,"Firebolt, Sneakoscope",362,49,7,Hogwarts_Student_Data_20240601.csv,2024-12-13 10:31:44.013827


In [80]:
df.isnull().mean()*100

Student Name                           0.0
House                                  0.0
Year                                   0.0
Wand Type                              0.0
Pet                                    0.0
Potions Grade                          0.0
Defense Against the Dark Arts Grade    0.0
Transfiguration Grade                  0.0
Spells Learned                         0.0
Quidditch Position                     0.0
Points Earned for House                0.0
Detentions                             0.0
Participation in Dueling Club          0.0
Triwizard Tournament Involvement       0.0
Items Owned                            0.0
Knuts Spent in Hogsmeade               0.0
Attendance at Classes                  0.0
Magical Accidents                      0.0
fileName                               0.0
loadDatetimeStamp                      0.0
dtype: float64

In [81]:
remove_punctuation(df, ['Quidditch Position'])

Unnamed: 0,Student Name,House,Year,Wand Type,Pet,Potions Grade,Defense Against the Dark Arts Grade,Transfiguration Grade,Spells Learned,Quidditch Position,Points Earned for House,Detentions,Participation in Dueling Club,Triwizard Tournament Involvement,Items Owned,Knuts Spent in Hogsmeade,Attendance at Classes,Magical Accidents,fileName,loadDatetimeStamp
0,Seamus Potter,Slytherin,2,"9 inches, elm, veela hair core",Dragon,Outstanding,Exceeds Expectations,Outstanding,26,Beater,-100,3,No,No,"Time-Turner, Marauder's Map",88,74,5,Hogwarts_Student_Data_20240101.csv,2024-12-13 10:31:43.870128
1,Ginny Spinnet,Gryffindor,6,"9 inches, elm, veela hair core",Cat,Exceeds Expectations,Acceptable,Outstanding,11,Chaser,139,2,No,No,"Firebolt, Sneakoscope, Extendable Ears",99,67,6,Hogwarts_Student_Data_20240101.csv,2024-12-13 10:31:43.870128
2,Padma Parkinson,Gryffindor,4,"11 inches, holly, phoenix feather core",Muggle Born,Outstanding,Acceptable,Outstanding,19,Seeker,93,2,No,No,Invisibility Cloak,342,88,6,Hogwarts_Student_Data_20240101.csv,2024-12-13 10:31:43.870128
3,Unknown Student,Ravenclaw,1,"12 inches, yew, unicorn hair core",Toad,Exceeds Expectations,Exceeds Expectations,Exceeds Expectations,16,Seeker,-73,2,No,Unknown Triwizard,Time-Turner,233,45,5,Hogwarts_Student_Data_20240101.csv,2024-12-13 10:31:43.870128
4,Unknown Student,Ravenclaw,1,"10 inches, oak, dragon heartstring core",Owl,-1,Outstanding,Exceeds Expectations,9,Keeper,90,0,No,Yes,"Marauder's Map, Sneakoscope",473,27,7,Hogwarts_Student_Data_20240101.csv,2024-12-13 10:31:43.870128
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47995,Susan Potter,Slytherin,1,"11 inches, holly, phoenix feather core",Unknown Pet,Outstanding,Outstanding,Exceeds Expectations,24,Seeker,90,5,Yes,Yes,"Marauder's Map, Firebolt",399,40,1,Hogwarts_Student_Data_20240601.csv,2024-12-13 10:31:44.013827
47996,Alicyia Boot,Unknown House,4,"11 inches, holly, phoenix feather core",Toad,-1,Poor,Poor,28,Seeker,-9,1,Yes,Yes,"Extendable Ears, Marauder's Map, Invisibility ...",299,27,5,Hogwarts_Student_Data_20240601.csv,2024-12-13 10:31:44.013827
47997,Terry Granger,Ravenclaw,7,"9 inches, elm, veela hair core",Owl,Outstanding,Poor,Exceeds Expectations,12,Seeker,-47,4,Yes,No,"Sneakoscope, Firebolt",124,27,10,Hogwarts_Student_Data_20240601.csv,2024-12-13 10:31:44.013827
47998,Ron Bell,Hufflepuff,1,"9 inches, elm, veela hair core",Dragon,Exceeds Expectations,Acceptable,Acceptable,11,Chaser,75,5,Yes,No,"Firebolt, Sneakoscope",362,49,7,Hogwarts_Student_Data_20240601.csv,2024-12-13 10:31:44.013827


In [82]:
df.loc[:, 'House'] = df['House'].str.capitalize()

In [83]:
df

Unnamed: 0,Student Name,House,Year,Wand Type,Pet,Potions Grade,Defense Against the Dark Arts Grade,Transfiguration Grade,Spells Learned,Quidditch Position,Points Earned for House,Detentions,Participation in Dueling Club,Triwizard Tournament Involvement,Items Owned,Knuts Spent in Hogsmeade,Attendance at Classes,Magical Accidents,fileName,loadDatetimeStamp
0,Seamus Potter,Slytherin,2,"9 inches, elm, veela hair core",Dragon,Outstanding,Exceeds Expectations,Outstanding,26,Beater,-100,3,No,No,"Time-Turner, Marauder's Map",88,74,5,Hogwarts_Student_Data_20240101.csv,2024-12-13 10:31:43.870128
1,Ginny Spinnet,Gryffindor,6,"9 inches, elm, veela hair core",Cat,Exceeds Expectations,Acceptable,Outstanding,11,Chaser,139,2,No,No,"Firebolt, Sneakoscope, Extendable Ears",99,67,6,Hogwarts_Student_Data_20240101.csv,2024-12-13 10:31:43.870128
2,Padma Parkinson,Gryffindor,4,"11 inches, holly, phoenix feather core",Muggle Born,Outstanding,Acceptable,Outstanding,19,Seeker,93,2,No,No,Invisibility Cloak,342,88,6,Hogwarts_Student_Data_20240101.csv,2024-12-13 10:31:43.870128
3,Unknown Student,Ravenclaw,1,"12 inches, yew, unicorn hair core",Toad,Exceeds Expectations,Exceeds Expectations,Exceeds Expectations,16,Seeker,-73,2,No,Unknown Triwizard,Time-Turner,233,45,5,Hogwarts_Student_Data_20240101.csv,2024-12-13 10:31:43.870128
4,Unknown Student,Ravenclaw,1,"10 inches, oak, dragon heartstring core",Owl,-1,Outstanding,Exceeds Expectations,9,Keeper,90,0,No,Yes,"Marauder's Map, Sneakoscope",473,27,7,Hogwarts_Student_Data_20240101.csv,2024-12-13 10:31:43.870128
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47995,Susan Potter,Slytherin,1,"11 inches, holly, phoenix feather core",Unknown Pet,Outstanding,Outstanding,Exceeds Expectations,24,Seeker,90,5,Yes,Yes,"Marauder's Map, Firebolt",399,40,1,Hogwarts_Student_Data_20240601.csv,2024-12-13 10:31:44.013827
47996,Alicyia Boot,Unknown house,4,"11 inches, holly, phoenix feather core",Toad,-1,Poor,Poor,28,Seeker,-9,1,Yes,Yes,"Extendable Ears, Marauder's Map, Invisibility ...",299,27,5,Hogwarts_Student_Data_20240601.csv,2024-12-13 10:31:44.013827
47997,Terry Granger,Ravenclaw,7,"9 inches, elm, veela hair core",Owl,Outstanding,Poor,Exceeds Expectations,12,Seeker,-47,4,Yes,No,"Sneakoscope, Firebolt",124,27,10,Hogwarts_Student_Data_20240601.csv,2024-12-13 10:31:44.013827
47998,Ron Bell,Hufflepuff,1,"9 inches, elm, veela hair core",Dragon,Exceeds Expectations,Acceptable,Acceptable,11,Chaser,75,5,Yes,No,"Firebolt, Sneakoscope",362,49,7,Hogwarts_Student_Data_20240601.csv,2024-12-13 10:31:44.013827


In [93]:
expected_formats = {
    'Student Name': 'string',
    'House': 'string',
    'Year': 'int32',
    'Wand Type': 'string',
    'Pet': 'string',
    'Potions Grade': 'string',
    'Defense Against the Dark Arts Grade': 'string',
    'Transfiguration Grade': 'string',
    'Spells Learned': 'int32',
    'Quidditch Position': 'string',
    'Points Earned for House': 'int32',
    'Detentions': 'int32',
    'Participation in Dueling Club': 'string',
    'Triwizard Tournament Involvement': 'string',
    'Items Owned': 'string',
    'Knuts Spent in Hogsmeade': 'int32',
    'Attendance at Classes': 'int32',
    'Magical Accidents': 'int32',
    'fileName': 'string',
    'loadDatetimeStamp': 'datetime64[ns]'
}

In [94]:
check_formats(df, expected_formats=expected_formats)

Below are incorrect formats
--------------------------------------------------
Correct Column Count 16


Unnamed: 0,Column,Actual,Expected
0,Attendance at Classes,int64,int32
1,Magical Accidents,int64,int32
2,fileName,object,string
3,loadDatetimeStamp,datetime64[us],datetime64[ns]


In [96]:
convert_column_types(df, columns_types=expected_formats)

Unnamed: 0,Student Name,House,Year,Wand Type,Pet,Potions Grade,Defense Against the Dark Arts Grade,Transfiguration Grade,Spells Learned,Quidditch Position,Points Earned for House,Detentions,Participation in Dueling Club,Triwizard Tournament Involvement,Items Owned,Knuts Spent in Hogsmeade,Attendance at Classes,Magical Accidents,fileName,loadDatetimeStamp
0,Seamus Potter,Slytherin,2,"9 inches, elm, veela hair core",Dragon,Outstanding,Exceeds Expectations,Outstanding,26,Beater,-100,3,No,No,"Time-Turner, Marauder's Map",88,74,5,Hogwarts_Student_Data_20240101.csv,2024-12-13 10:31:43.870128
1,Ginny Spinnet,Gryffindor,6,"9 inches, elm, veela hair core",Cat,Exceeds Expectations,Acceptable,Outstanding,11,Chaser,139,2,No,No,"Firebolt, Sneakoscope, Extendable Ears",99,67,6,Hogwarts_Student_Data_20240101.csv,2024-12-13 10:31:43.870128
2,Padma Parkinson,Gryffindor,4,"11 inches, holly, phoenix feather core",Muggle Born,Outstanding,Acceptable,Outstanding,19,Seeker,93,2,No,No,Invisibility Cloak,342,88,6,Hogwarts_Student_Data_20240101.csv,2024-12-13 10:31:43.870128
3,Unknown Student,Ravenclaw,1,"12 inches, yew, unicorn hair core",Toad,Exceeds Expectations,Exceeds Expectations,Exceeds Expectations,16,Seeker,-73,2,No,Unknown Triwizard,Time-Turner,233,45,5,Hogwarts_Student_Data_20240101.csv,2024-12-13 10:31:43.870128
4,Unknown Student,Ravenclaw,1,"10 inches, oak, dragon heartstring core",Owl,-1,Outstanding,Exceeds Expectations,9,Keeper,90,0,No,Yes,"Marauder's Map, Sneakoscope",473,27,7,Hogwarts_Student_Data_20240101.csv,2024-12-13 10:31:43.870128
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47995,Susan Potter,Slytherin,1,"11 inches, holly, phoenix feather core",Unknown Pet,Outstanding,Outstanding,Exceeds Expectations,24,Seeker,90,5,Yes,Yes,"Marauder's Map, Firebolt",399,40,1,Hogwarts_Student_Data_20240601.csv,2024-12-13 10:31:44.013827
47996,Alicyia Boot,Unknown house,4,"11 inches, holly, phoenix feather core",Toad,-1,Poor,Poor,28,Seeker,-9,1,Yes,Yes,"Extendable Ears, Marauder's Map, Invisibility ...",299,27,5,Hogwarts_Student_Data_20240601.csv,2024-12-13 10:31:44.013827
47997,Terry Granger,Ravenclaw,7,"9 inches, elm, veela hair core",Owl,Outstanding,Poor,Exceeds Expectations,12,Seeker,-47,4,Yes,No,"Sneakoscope, Firebolt",124,27,10,Hogwarts_Student_Data_20240601.csv,2024-12-13 10:31:44.013827
47998,Ron Bell,Hufflepuff,1,"9 inches, elm, veela hair core",Dragon,Exceeds Expectations,Acceptable,Acceptable,11,Chaser,75,5,Yes,No,"Firebolt, Sneakoscope",362,49,7,Hogwarts_Student_Data_20240601.csv,2024-12-13 10:31:44.013827


In [99]:
check_mispelling(df, 'Student Name', .8)

Unnamed: 0,name1,name2,similarity
0,Pansy BelLl,PQansy Bell,0.8182
1,Pansy BelLl,PansVy Bell,0.8182
2,Pansy BelLl,Pansy Bell,0.9000
3,Pansy BelLl,PaUnsy Bell,0.8182
4,Pansy BelLl,Panssy Bell,0.9000
...,...,...,...
27234,Cedriic Granger,Cedric Granjger,0.9167
27235,Alicia PatilG,Alicia Pahtil,0.8000
27236,Fred Llongbottom,Fred Longbottmom,0.9231
27237,Dracon Potter,Dean Poxtter,0.8182


In [100]:
df2 = df.copy()
rename_values = {
    'Pansy BelLl': 'Pansy Parkinson',
    'Alicia PatilG': 'Alicia Patil',
    'Dracon Potter': 'Draco Potter'
}

df2['Student Name'] = df2['Student Name'].replace(rename_values)
sorted(df2['Student Name'].unique())

['ABlicia Potter',
 'ADlicia Parkinson',
 'ADlicia Patil',
 'AEngelina Zabini',
 'AGngelina Patil',
 'AJngelina Parkinson',
 'ALngelina Malfoy',
 'ANngelina Diggory',
 'ATlicia Zabini',
 'AUlicia Potter',
 'AWlicia Lovegood',
 'Aalicia Bones',
 'Acngelina Johnson',
 'Aelicia Finnigan',
 'Aengelina Bell',
 'Afngelina Thomas',
 'AlBicia Diggory',
 'AlIicia Bones',
 'AlIicia Malfoy',
 'AlIicia Zabini',
 'AlJicia Bell',
 'AlWicia Brown',
 'Algicia Thomas',
 'AliBcia Zabini',
 'AliFcia Bell',
 'AliLcia Bell',
 'AliVcia Potter',
 'AlicBia Weasley',
 'AlicCia Patil',
 'AlicKia Weasley',
 'AlicOia Bones',
 'AlicRia Boot',
 'AlicTia Zabini',
 'AlicUia Bell',
 'AlicXia Bell',
 'AlicYia Boot',
 'Alicdia Bones',
 'Alicgia Bell',
 'AliciHa Boot',
 'AliciJa Johnson',
 'AliciKa Chang',
 'AliciLa Granger',
 'AliciPa Granger',
 'AliciUa Chang',
 'AliciVa Potter',
 'AliciZa Potter',
 'Alicia Bell',
 'Alicia BellT',
 'Alicia Bloot',
 'Alicia BonUes',
 'Alicia Bones',
 'Alicia Bonesc',
 'Alicia Bonles',
 

In [101]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48000 entries, 0 to 47999
Data columns (total 20 columns):
 #   Column                               Non-Null Count  Dtype         
---  ------                               --------------  -----         
 0   Student Name                         48000 non-null  string        
 1   House                                48000 non-null  string        
 2   Year                                 48000 non-null  int32         
 3   Wand Type                            48000 non-null  string        
 4   Pet                                  48000 non-null  string        
 5   Potions Grade                        48000 non-null  string        
 6   Defense Against the Dark Arts Grade  48000 non-null  string        
 7   Transfiguration Grade                48000 non-null  string        
 8   Spells Learned                       48000 non-null  int32         
 9   Quidditch Position                   48000 non-null  string        
 10  Points Ear

In [102]:
check_formats(df, expected_formats=expected_formats)

Validation Complete, no discrepencies


In [105]:
df.to_parquet('./Medallion Architecture/silver/silver_transactions.parquet', index=False)