In [1]:
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pickle
import os
import seaborn as sns
from tqdm.auto import tqdm

## 集大成！

### 集结P1A3

In [3]:
parent_directory = '/scratch/work/lyub2/Problem_output_September/'

android_missing_counts_total = 0
ios_missing_counts_total = 0
df_list = []

# Iterate through all 30 folders with tqdm for progress bar
for day in tqdm(range(1, 2), desc="Processing folders"):
    # Generate folder name in the format 'MMDD_output_csv' (e.g., '0901_output_csv')
    folder_name = f"09{str(day).zfill(2)}_output_csv"
    # Define the full path to the csv file
    file_path = os.path.join(parent_directory, folder_name, 'P1A3_missing_values_os.csv')
    
    # Check if the file exists to avoid errors
    if not os.path.isfile(file_path):
        print(f"File not found: {file_path}")
        continue
    
    # Read the current CSV file
    df = pd.read_csv(file_path)
    
    android_missing_counts_total += df['0_Android_mssing_counts'].sum()
    ios_missing_counts_total += df['1_iOS_mssing_counts'].sum()
    
    df_list.append(df)
    
total_counts = android_missing_counts_total + ios_missing_counts_total

android_missing_percentage_total = (android_missing_counts_total / total_counts) * 100 if total_counts > 0 else 0
ios_missing_percentage_total = (ios_missing_counts_total / total_counts) * 100 if total_counts > 0 else 0

# Create a final DataFrame to store the results
df_a3 = {
    '0_Android_missing_counts': [android_missing_counts_total],
    '1_iOS_missing_counts': [ios_missing_counts_total],
    '0_Android_missing_percentage': [android_missing_percentage_total],
    '1_iOS_missing_percentage': [ios_missing_percentage_total]
}

df_a3 = pd.DataFrame(df_a3)

#df_a3.to_csv('/scratch/work/lyub2/Problem_output_September/P1A3/df_a3.csv', index = False)

Processing folders:   0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
df_a3

Unnamed: 0,0_Android_missing_counts,1_iOS_missing_counts,0_Android_missing_percentage,1_iOS_missing_percentage
0,6490253,1089606,85.624983,14.375017


### 统计全部 上面的不行

In [25]:
# Define the folder and common file structure
folder_path = '/scratch/work/lyub2/2.df_tc_09'
file_prefix = 'df_tc_'
file_suffix = '.csv'

# Columns to check for missing values
columns_to_check = ['leg_order', 'id', 'activity', 'km', 'time_start_distance', 'time_end_distance']
cols = ['leg_order', 'id', 'activity', 'km', 'time_start_distance', 'time_end_distance', 'operatingSystem']
# Initialize accumulators for total counts
total_os_counts = pd.Series([0, 0], index=[0, 1])  # Using 0 for Android, 1 for iOS
total_missing_counts = pd.DataFrame(0, index=columns_to_check, columns=['0_Android_missing_counts', '1_iOS_missing_counts'])

# Loop through each CSV file
for i in tqdm(range(1, 31), desc="Processing files"):
    file_name = f'{file_prefix}{i:02d}{file_suffix}'
    file_path = os.path.join(folder_path, file_name)
    
    # Check if the file exists to avoid errors
    if not os.path.isfile(file_path):
        print(f"File not found: {file_path}")
        continue
    
    # Read the CSV file
    df_tc = pd.read_csv(file_path, usecols = cols)
    
    # Get the operating system counts for this file
    os_counts = df_tc['operatingSystem'].value_counts()
    os_counts = os_counts.reindex([0, 1], fill_value=-1)
    
    # Update the total counts of operating systems
    total_os_counts += os_counts

    # Group by 'operatingSystem' and count missing values for each column
    missing_values_os = df_tc.groupby('operatingSystem')[columns_to_check].apply(lambda x: x.isna().sum())

    # Update the total missing counts
    total_missing_counts['0_Android_missing_counts'] += missing_values_os.loc[0, columns_to_check].values
    total_missing_counts['1_iOS_missing_counts'] += missing_values_os.loc[1, columns_to_check].values

# Calculate the overall missing percentages after accumulating all counts
total_missing_percentages = pd.DataFrame(0, index=columns_to_check, columns=['0_Android_missing_percentage', '1_iOS_missing_percentage'])
total_missing_percentages['0_Android_missing_percentage'] = round((total_missing_counts['0_Android_missing_counts'] / total_os_counts[0]) * 100, 2)
total_missing_percentages['1_iOS_missing_percentage'] = round((total_missing_counts['1_iOS_missing_counts'] / total_os_counts[1]) * 100, 2)

# Create the final DataFrame combining counts and percentages
final_df = pd.concat([total_missing_counts, total_missing_percentages], axis=1)



Processing files:   0%|          | 0/30 [00:00<?, ?it/s]

In [23]:
total_os_counts

0    18409304
1     2691171
dtype: int64

In [26]:
total_os_counts

0    258374813
1     34103689
dtype: int64

In [27]:
final_df

Unnamed: 0,0_Android_missing_counts,1_iOS_missing_counts,0_Android_missing_percentage,1_iOS_missing_percentage
leg_order,24689763,3869391,9.56,11.35
id,24706800,3870436,9.56,11.35
activity,24706800,3870436,9.56,11.35
km,24706800,3870436,9.56,11.35
time_start_distance,24706800,3870436,9.56,11.35
time_end_distance,24706800,3870436,9.56,11.35


In [28]:
final_df.to_csv('/scratch/work/lyub2/Problem_output_September/P1A3/df_a3.csv')