In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Academic-year labels of the form 'YYYY-YYYY+1' for start years 2019..2024,
# skipping the academic year that starts in 2020.
years = [f'{start}-{start + 1}' for start in range(2019, 2025) if start != 2020]

# One path template per contributor; the '***' placeholder is replaced by an
# academic-year label when the files are loaded.
prefixes_sufixes = [
    '../../data/interim/***_Alexandr.csv',
    '../../data/interim/***_lili.csv',
    '../../data/interim/***_dongyi.csv',
    '../../data/interim/***_saaransh.csv',
    '../../data/interim/***_jose.csv',
]

We first merge the contributors' dataframes horizontally (side by side, after checking that their `responseid` columns agree). The result is one `csv` file per academic year.

In [None]:
# For each academic year: load every contributor's file, check that all of
# them list the same responseid values in the same order, and, if so, write a
# single side-by-side merged csv for that year.
for year in years:

    print(f'merging dataframes from academic year {year}')

    # Load all dataframes of the same year
    dfs = []
    for pre_suf in prefixes_sufixes:
        filename = pre_suf.replace('***', year)
        df = pd.read_csv(filename)

        # adding a 'will_remove' column of zeros to Dongyi's df
        # (more generally: to any file that does not already have one)
        if 'will_remove' not in df.columns:
            df.insert(loc=1, column='will_remove', value=np.zeros(len(df)))

        dfs.append(df)

    # verify all responseid columns agree (compared against the first file)
    cols_agree = True
    for i in range(1, len(dfs)):
        if not dfs[0]['responseid'].equals(dfs[i]['responseid']):
            print(f'responseid from dataframes {prefixes_sufixes[0]} and {prefixes_sufixes[i]} do not agree')
            cols_agree = False

    # If all responseid columns agree, we can merge.
    # The success message is printed here rather than in a for/else clause:
    # a loop's `else` runs whenever the loop finishes without `break`, so it
    # would also fire after a mismatch had been reported.
    if cols_agree:
        print('  all responseid match')

        new_df = dfs[0][['responseid']].copy()
        # Row-wise maximum of the flags across files, so a row flagged in any
        # file stays flagged (assuming the flags are 0/1).
        new_df['will_remove'] = pd.concat([df['will_remove'].astype(int) for df in dfs], axis=1).max(axis=1)
        # Skip the first two columns of every file by position; assumes they
        # are 'responseid' and 'will_remove' -- TODO confirm for all files.
        new_df = pd.concat([new_df] + [df[df.columns[2:]] for df in dfs], axis=1)
        new_df.to_csv('../../data/interim/' + str(year) + '_first_merge.csv', index=False)
        

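We now merge the per-year frames vertically (stacking rows across academic years), after splitting each year 50-50 into train and test halves.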

In [None]:
final_dataframe_name = '../../data/processed/all_data'

# Load each year's horizontally merged frame, tag it with its survey year and
# split it 50-50 into a train half and a test half.

dfs_train = []
dfs_test = []
# Starting calendar year of each academic year label ('2019-2020' -> 2019).
# Derived from `years` so the two sequences cannot drift out of sync.
year_insert = [int(label.split('-')[0]) for label in years]
for year, survey_year in zip(years, year_insert):
    df = pd.read_csv(f'../../data/interim/{year}_first_merge.csv')
    df.insert(1, 'survey_year', survey_year)  # add the academic year column

    # Do a 50-50 split of the data for each year (fixed seed for reproducibility)
    df_train, df_test = train_test_split(
        df, shuffle=True, random_state=440, test_size=0.5
    )
    dfs_train.append(df_train)
    dfs_test.append(df_test)

# verify column names are consistent across different years
# Enough to do it for train dataframes: each test frame is split from the
# same per-year frame as its train counterpart, so it has the same columns.

cols_agree = True
for i in range(1, len(dfs_train)):
    if not dfs_train[0].columns.equals(dfs_train[i].columns):
        print(f'columns from years {years[0]} and {years[i]} do not agree')
        cols_agree = False

# If all column names agree, we can merge vertically.
# The success message lives inside this branch rather than in a for/else
# clause, whose `else` would run even after a mismatch (no `break` in the loop).

if cols_agree:
    print('columns match across different years')

    new_df_train = pd.concat(dfs_train, axis=0)
    new_df_test = pd.concat(dfs_test, axis=0)

    # Drop entries flagged for removal
    new_df_train = new_df_train[new_df_train['will_remove'] == 0]
    new_df_train = new_df_train.drop(columns=['will_remove'])

    new_df_test = new_df_test[new_df_test['will_remove'] == 0]
    new_df_test = new_df_test.drop(columns=['will_remove'])

    # Temporary solution (to be done in Saaransh notebooks):
    # keep only rows whose age is a whole number no greater than 60.
    new_df_train = new_df_train[(new_df_train['age'] % 1 == 0.0) & (new_df_train['age'] <= 60)]
    new_df_test = new_df_test[(new_df_test['age'] % 1 == 0.0) & (new_df_test['age'] <= 60)]

    # To save space in the csv, turn columns to Int64
    # Exceptions: 'responseid' is a string and 'gpa_sr' is a float
    # A single astype() with a mapping returns new frames, which avoids
    # assigning column by column into frames produced by boolean filtering.
    int_dtypes = {
        column: 'Int64'
        for column in new_df_train.columns
        if column not in ['responseid', 'gpa_sr']
    }
    new_df_train = new_df_train.astype(int_dtypes)
    new_df_test = new_df_test.astype(int_dtypes)

    # Write the train and test halves to csv
    new_df_train.to_csv(final_dataframe_name + '_train.csv', index=False)
    new_df_test.to_csv(final_dataframe_name + '_test.csv', index=False)

In [None]:
# Split the saved test half into an additional training portion and a
# validation portion (40% goes to validation), stratified by survey year.
final_dataframe_name = '../../data/processed/all_data'
df_second_half = pd.read_csv('../../data/processed/all_data_test.csv')

# Cast every column to nullable Int64 except the two named here.
non_integer_columns = ['responseid' , 'gpa_sr']
df_second_half = df_second_half.astype(
    {name: 'Int64' for name in df_second_half.columns if name not in non_integer_columns}
)

df_additional_train, df_validation = train_test_split(
    df_second_half,
    test_size=0.4,
    random_state=440,
    shuffle=True,
    stratify=df_second_half['survey_year'],
)

# Persist both portions next to the other processed files.
df_additional_train.to_csv(final_dataframe_name + '_additional_train.csv', index=False)
df_validation.to_csv(final_dataframe_name + '_validation.csv', index=False)