In [1]:
import pandas as pd
import chardet as chardet

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


# GPA Data Cleaning

In [2]:
input_file = 'gpa.csv'
output_file = 'gpa_utf8.csv'

# Read the file in UTF-16 encoding and fix irregular lines
lines = []
with open(input_file, 'r', encoding='utf-16') as infile:
    for line in infile:
        # Split the line into fields and fix the irregular line if needed
        fields = line.strip().split(',')
        if len(fields) != 2:
            #print(f"Fixing line: {line.strip()}")
            # Example fix: join extra fields
            fields = [','.join(fields[:2])] + fields[2:]
        lines.append(','.join(fields))

# Write the fixed lines to a new file in UTF-8 encoding
with open(output_file, 'w', encoding='utf-8') as outfile:
    for line in lines:
        outfile.write(line + '\n')

print(f"File converted to UTF-8 and saved as {output_file}")

File converted to UTF-8 and saved as gpa_utf8.csv


In [3]:
gpa_df = pd.read_csv("gpa_utf8.csv", sep="\t")

In [4]:
gpa_df_usa = gpa_df[gpa_df["City"] != 'n/a ']
gpa_df_usa = gpa_df_usa.drop(columns=["Calculation1"])

In [5]:
def diff_gpa(mini_df):
    return mini_df[mini_df['Measure Names'] == 'Enr GPA']['Measure Values']
    
#gpa_df_usa.groupby("School").agg(diff_gpa)

In [6]:
enrl_gpa_df_usa = gpa_df_usa[gpa_df_usa["Measure Names"] == "Enrl GPA"]

adm_gpa_df_usa = gpa_df_usa[gpa_df_usa["Measure Names"] == "Adm GPA"]

In [7]:
enrl_gpa_df_usa = enrl_gpa_df_usa.drop(columns = ["Measure Names"])
enrl_gpa_df_usa = enrl_gpa_df_usa.rename(columns={"Measure Values": "Enrl GPA"}).set_index("School")

adm_gpa_df_usa = adm_gpa_df_usa.drop(columns = ["Measure Names"])
adm_gpa_df_usa = adm_gpa_df_usa.rename(columns={"Measure Values": "Adm GPA"}).set_index("School")

In [8]:
enrl_gpa_df_usa["Adm GPA"] = adm_gpa_df_usa["Adm GPA"]

def calc_gpa_diff(ser):
    return ser["Adm GPA"] - ser["Enrl GPA"]

enrl_gpa_df_usa["GPA Diff"] = enrl_gpa_df_usa.apply(calc_gpa_diff, axis=1)
enrl_gpa_df_usa["GPA Diff"]

School
A & M CONSOLIDATED HIGH SCHOOL         NaN
A B MILLER HIGH SCHOOL                 NaN
A N MCCALLUM HIGH SCHOOL               NaN
ABINGTON FRIENDS SCHOOL                NaN
ABRAHAM LINCOLN HIGH SCHOOL       0.001727
                                    ...   
YUCAIPA SENIOR HIGH SCHOOL             NaN
YUCCA VALLEY HIGH SCHOOL               NaN
YULA BOYS HIGH SCHOOL                  NaN
YUMA CATHOLIC HIGH SCHOOL              NaN
ZIONSVILLE COMMUNITY HS                NaN
Name: GPA Diff, Length: 2759, dtype: float64

In [9]:
enrl_gpa_df_usa

Unnamed: 0_level_0,City,County/State/Country,Enrl GPA,Adm GPA,GPA Diff
School,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A & M CONSOLIDATED HIGH SCHOOL,College Station,Texas,,,
A B MILLER HIGH SCHOOL,Fontana,San Bernardino,,4.090000,
A N MCCALLUM HIGH SCHOOL,Austin,Texas,,4.228333,
ABINGTON FRIENDS SCHOOL,Jenkintown,Pennsylvania,,,
ABRAHAM LINCOLN HIGH SCHOOL,Los Angeles,Los Angeles,4.081176,4.082903,0.001727
...,...,...,...,...,...
YUCAIPA SENIOR HIGH SCHOOL,Yucaipa,San Bernardino,,4.274615,
YUCCA VALLEY HIGH SCHOOL,Yucca Valley,San Bernardino,,,
YULA BOYS HIGH SCHOOL,Los Angeles,Los Angeles,,,
YUMA CATHOLIC HIGH SCHOOL,Yuma,Arizona,,4.282000,


# Admittance Data Cleaning

In [10]:
input_file = 'admittance.csv'
output_file = 'admittance_utf8.csv'

# Read the file in UTF-16 encoding and fix irregular lines
lines = []
with open(input_file, 'r', encoding='utf-16') as infile:
    for line in infile:
        # Split the line into fields and fix the irregular line if needed
        fields = line.strip().split(',')
        if len(fields) != 2:
            #print(f"Fixing line: {line.strip()}")
            # Example fix: join extra fields
            fields = [','.join(fields[:2])] + fields[2:]
        lines.append(','.join(fields))

# Write the fixed lines to a new file in UTF-8 encoding
with open(output_file, 'w', encoding='utf-8') as outfile:
    for line in lines:
        outfile.write(line + '\n')

print(f"File converted to UTF-8 and saved as {output_file}")

File converted to UTF-8 and saved as admittance_utf8.csv


In [11]:
admittance_df = pd.read_csv("admittance_utf8.csv", sep="\t")
admittance_df

Unnamed: 0,Calculation1,City,County/State/ Territory,Count,gender,School,Pivot Field Values
0,2ND NATURE ACADEMY HIGH SCHOOL300024,Nashua,NH,Enr,Unknown,2ND NATURE ACADEMY HIGH SCHOOL,
1,2ND NATURE ACADEMY HIGH SCHOOL300024,Nashua,NH,Adm,Unknown,2ND NATURE ACADEMY HIGH SCHOOL,
2,2ND NATURE ACADEMY HIGH SCHOOL300024,Nashua,NH,App,Unknown,2ND NATURE ACADEMY HIGH SCHOOL,
3,2ND NATURE ACADEMY HIGH SCHOOL300024,Nashua,NH,Enr,Other,2ND NATURE ACADEMY HIGH SCHOOL,
4,2ND NATURE ACADEMY HIGH SCHOOL300024,Nashua,NH,Adm,Other,2ND NATURE ACADEMY HIGH SCHOOL,
...,...,...,...,...,...,...,...
120259,ZURICH INTERNATIONAL SCHOOL796330,,,Adm,Female,,
120260,ZURICH INTERNATIONAL SCHOOL796330,,,App,Female,,
120261,ZURICH INTERNATIONAL SCHOOL796330,,,Enr,All,,
120262,ZURICH INTERNATIONAL SCHOOL796330,,,Adm,All,,


In [12]:
admittance_df_usa = admittance_df[admittance_df["City"] != 'n/a ']
admittance_df_usa = admittance_df[admittance_df["City"].isna() == False]
admittance_df_usa = admittance_df_usa.drop(columns=["Calculation1"])
admittance_df_usa = admittance_df_usa[admittance_df["gender"] == "All"]

  admittance_df_usa = admittance_df_usa[admittance_df["gender"] == "All"]


In [13]:
admittance_df_usa.iloc[:20]

Unnamed: 0,City,County/State/ Territory,Count,gender,School,Pivot Field Values
9,Nashua,NH,Enr,All,2ND NATURE ACADEMY HIGH SCHOOL,
10,Nashua,NH,Adm,All,2ND NATURE ACADEMY HIGH SCHOOL,
11,Nashua,NH,App,All,2ND NATURE ACADEMY HIGH SCHOOL,
21,West Chester,PA,Enr,All,21ST CENTURY CYBER CHARTER SCH,
22,West Chester,PA,Adm,All,21ST CENTURY CYBER CHARTER SCH,
23,West Chester,PA,App,All,21ST CENTURY CYBER CHARTER SCH,
51,College Station,TX,Enr,All,A & M CONSOLIDATED HIGH SCHOOL,
52,College Station,TX,Adm,All,A & M CONSOLIDATED HIGH SCHOOL,3.0
53,College Station,TX,App,All,A & M CONSOLIDATED HIGH SCHOOL,5.0
78,Fontana,San Bernardino,Enr,All,A B MILLER HIGH SCHOOL,


In [41]:
enr_admittance_df_usa = admittance_df_usa[admittance_df_usa["Count"] == "Enr"].set_index("School")
adm_admittance_df_usa = admittance_df_usa[admittance_df_usa["Count"] == "Adm"].set_index("School")
final_admittance_df_usa = admittance_df_usa[admittance_df_usa["Count"] == "App"]
final_admittance_df_usa = final_admittance_df_usa.rename(columns={"Pivot Field Values": "App Ct"}).set_index("School")

In [42]:
final_admittance_df_usa["Adm Ct"] = adm_admittance_df_usa["Pivot Field Values"]
final_admittance_df_usa["Enr Ct"] = enr_admittance_df_usa["Pivot Field Values"]

In [43]:
final_admittance_df_usa

Unnamed: 0_level_0,City,County/State/ Territory,Count,gender,App Ct,Adm Ct,Enr Ct
School,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2ND NATURE ACADEMY HIGH SCHOOL,Nashua,NH,App,All,,,
21ST CENTURY CYBER CHARTER SCH,West Chester,PA,App,All,,,
A & M CONSOLIDATED HIGH SCHOOL,College Station,TX,App,All,5.0,3.0,
A B MILLER HIGH SCHOOL,Fontana,San Bernardino,App,All,32.0,13.0,
A BEKA ACADEMY VIDEO FOR HOMES,Pensacola,FL,App,All,,,
...,...,...,...,...,...,...,...
YUCAIPA SENIOR HIGH SCHOOL,Yucaipa,San Bernardino,App,All,49.0,13.0,4.0
YUCCA VALLEY HIGH SCHOOL,Yucca Valley,San Bernardino,App,All,14.0,,
YULA BOYS HIGH SCHOOL,Los Angeles,Los Angeles,App,All,11.0,3.0,
YUMA CATHOLIC HIGH SCHOOL,Yuma,AZ,App,All,6.0,5.0,
