In [2]:
import pandas as pd
import os
import gc

input_path = r"C:\scaling_forecasts.csv"
output_path = r'C:\Users\noahi\Downloads\scaling_forecasts_CLEANED.csv'

# This clears any lingering Python connections to the file
gc.collect()

print("Attempting to open file...")

try:
    # We use 'with' which ensures the file is closed immediately after reading
    with pd.read_csv(input_path, chunksize=10000, encoding='latin-1', on_bad_lines='skip') as reader:
        # Delete the old output if it exists to avoid conflicts
        if os.path.exists(output_path):
            os.remove(output_path)
            
        for chunk in reader:
            # Clean
            chunk = chunk.dropna(how='all').drop_duplicates()
            
            # Save
            chunk.to_csv(output_path, mode='a', index=False, header=not os.path.exists(output_path))
            
    print("✅ FINAL SUCCESS! The cleaned file is ready.")

except PermissionError:
    print("❌ ERROR: The file is STILL locked. Try restarting your laptop or moving the file to your Desktop.")
except Exception as e:
    print(f"❌ DIFFERENT ERROR: {e}")

Attempting to open file...
❌ ERROR: The file is STILL locked. Try restarting your laptop or moving the file to your Desktop.


In [4]:
import pandas as pd
import shutil
import os

original_path = r"C:\scaling_forecasts.csv\scaling_forecasts.csv"

temp_path = r'C:\Users\noahi\Downloads\temp_copy.csv'
output_path = r'C:\Users\noahi\Downloads\scaling_forecasts_CLEANED.csv'

try:
    # 1. Force a copy of the file
    shutil.copy2(original_path, temp_path)
    print("Made a temp copy to bypass the lock...")

    # 2. Process the TEMP copy
    with pd.read_csv(temp_path, chunksize=10000, encoding='latin-1', on_bad_lines='skip') as reader:
        if os.path.exists(output_path):
            os.remove(output_path)
            
        for chunk in reader:
            chunk = chunk.dropna(how='all').drop_duplicates()
            chunk.to_csv(output_path, mode='a', index=False, header=not os.path.exists(output_path))
            
    print("✅ SUCCESS! Cleaned file created.")
    
    # 3. Cleanup temp file
    os.remove(temp_path)

except Exception as e:
    print(f"❌ Error: {e}")

Made a temp copy to bypass the lock...
✅ SUCCESS! Cleaned file created.


In [5]:
import pandas as pd

# Load just the first 5 rows to peek at the data
preview = pd.read_csv(r'C:\Users\noahi\Downloads\scaling_forecasts_CLEANED.csv', nrows=5)

print("--- Data Preview ---")
print(preview)

# Note: Counting rows in a huge file can be slow, 
# but this is the memory-efficient way to do it:
row_count = 0
for chunk in pd.read_csv(r'C:\Users\noahi\Downloads\scaling_forecasts_CLEANED.csv', chunksize=50000):
    row_count += len(chunk)

print(f"\nTotal rows in cleaned file: {row_count}")

--- Data Preview ---
   campus_id            timestamp  projected_deliveries  required_robots  \
0        3.0  2023-06-21 07:28:00                 108.0              7.0   
1        7.0  2025-03-21 04:13:00                 112.0              6.0   
2        5.0  2024-06-27 10:59:00                 134.0             11.0   
3        9.0  2020-07-29 06:57:00                 116.0             16.0   
4        3.0  2022-06-09 12:15:00                  87.0             17.0   

   expansion_cost  
0     3110.191200  
1     8393.164235  
2     9426.772879  
3     2183.157559  
4     3581.815198  

Total rows in cleaned file: 835071


In [6]:
import pandas as pd

path = r'C:\Users\noahi\Downloads\scaling_forecasts_CLEANED.csv'

# Since the blanks are gone, the file is smaller. 
# Let's try to load just the important columns to see if we can 
# do a final global duplicate check.
df = pd.read_csv(path, low_memory=False)

# Final global deduplicate
df = df.drop_duplicates()

# Save the absolute final version
df.to_csv(r'C:\Users\noahi\Downloads\scaling_forecasts_FINAL.csv', index=False)

print(f"Global clean finished. Final row count: {len(df)}")

Global clean finished. Final row count: 835071


In [7]:
import pandas as pd

df = pd.read_csv(r'C:\Users\noahi\Downloads\scaling_forecasts_FINAL.csv', nrows=1)
print(df.dtypes)

campus_id               float64
timestamp                object
projected_deliveries    float64
required_robots         float64
expansion_cost          float64
dtype: object


In [8]:
# Convert a 'Forecast_Value' column to actual numbers (float)
# errors='coerce' turns any remaining text junk into 'NaN' so it doesn't crash
df['campus_id'] = pd.to_numeric(df['campus_id'], errors='coerce')

# Convert a 'Date' column to actual dates

df['expansion_cost'] = pd.to_currency (df['expansion_cost'], errors='coerce')

# Save one last time with correct types
df.to_csv(r'C:\Users\noahi\Downloads\scaling_forecasts_FINAL_TYPED.csv', index=False)

AttributeError: module 'pandas' has no attribute 'to_currency'

In [9]:
import pandas as pd

# 1. Load the file
path = r'C:\Users\noahi\Downloads\scaling_forecasts_FINAL.csv'
df = pd.read_csv(path, low_memory=False)

# 2. Fix campus_id (Numeric)
df['campus_id'] = pd.to_numeric(df['campus_id'], errors='coerce')

# 3. Fix expansion_cost (Currency)
# We remove '$' and ',' then convert to float
df['expansion_cost'] = df['expansion_cost'].replace('[\$,]', '', regex=True)
df['expansion_cost'] = pd.to_numeric(df['expansion_cost'], errors='coerce')

# 4. Fix Date (Update 'date_column' to your actual column name, e.g., 'date')
# If your column is named 'date', use:
if 'date' in df.columns:
    df['date'] = pd.to_datetime(df['date'], errors='coerce')
elif 'forecast_date' in df.columns:
    df['forecast_date'] = pd.to_datetime(df['forecast_date'], errors='coerce')

# 5. Save the result
output_path = r'C:\Users\noahi\Downloads\scaling_forecasts_FINAL_TYPED.csv'
df.to_csv(output_path, index=False)

print("✅ Success! Data types converted and file saved.")
print(df.dtypes) # This will show you the new types (float64, datetime, etc.)

  df['expansion_cost'] = df['expansion_cost'].replace('[\$,]', '', regex=True)


✅ Success! Data types converted and file saved.
campus_id               float64
timestamp                object
projected_deliveries    float64
required_robots         float64
expansion_cost          float64
dtype: object


In [10]:
# This shows you the Min, Max, and Average for your numbers
print(df[['expansion_cost', 'campus_id']].describe())

       expansion_cost      campus_id
count    8.341840e+05  834143.000000
mean              NaN       4.417082
std               NaN      33.175158
min              -inf    -999.000000
25%      3.242957e+03       3.000000
50%      5.499307e+03       6.000000
75%      7.753601e+03       8.000000
max               inf      10.000000


  return umr_sum(a, axis, dtype, out, keepdims, initial, where)


In [12]:
# This shows you the Min, Max, and Average for your numbers
print(df[['expansion_cost', 'campus_id', 'project_deliveries', 'required_robots']].describe())

KeyError: "['project_deliveries'] not in index"

In [13]:
# This shows you the top 5 most frequent values in a column
# Helpful for seeing if "N/A" or "0" is filling up your data
print(df['expansion_cost'].value_counts().head())

expansion_cost
 inf              167
-inf              163
-9.999997e+08       8
-9.999997e+295      6
-9.999997e+07       5
Name: count, dtype: int64


In [14]:
import numpy as np

# 1. Replace Infinity with NaN
df['expansion_cost'] = df['expansion_cost'].replace([np.inf, -np.inf], np.nan)

# 2. Replace those specific weird negative placeholders
# We catch anything smaller than a logical minimum (e.g., negative costs)
df.loc[df['expansion_cost'] < -1000, 'expansion_cost'] = np.nan

# 3. Final Check: See if they are gone
print("--- Cleaned Column Stats ---")
print(df['expansion_cost'].describe())

--- Cleaned Column Stats ---
count     8.335580e+05
mean     1.332974e+302
std                inf
min       1.000016e+03
25%       3.245973e+03
50%       5.500809e+03
75%       7.753513e+03
max      9.999997e+307
Name: expansion_cost, dtype: float64


  sqr = _ensure_numeric((avg - values) ** 2)


In [15]:
# This goes through every text column and removes extra spaces at the start or end
df_obj = df.select_dtypes(['object'])
df[df_obj.columns] = df_obj.apply(lambda x: x.str.strip())

print("✅ Trailing spaces removed from all text columns!")

✅ Trailing spaces removed from all text columns!


In [16]:
import pandas as pd
import numpy as np

# 1. Convert Timestamp to actual Date/Time objects
# This allows you to filter by month, day, or hour later
df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')

# 2. Ensure numbers are floats (Decimals)
# We already cleaned expansion_cost, but we'll ensure the others are set too
num_cols = ['projected_deliveries', 'required_robots', 'campus_id', 'expansion_cost']

for col in num_cols:
    # Convert to numeric and turn errors/text into NaN
    df[col] = pd.to_numeric(df[col], errors='coerce')

# 3. Final Sanity Check: Remove any rows that became completely empty after conversion
df = df.dropna(how='all')

print("--- New Data Types ---")
print(df.dtypes)
print("\n--- First 5 Rows ---")
print(df.head())

--- New Data Types ---
campus_id                      float64
timestamp               datetime64[ns]
projected_deliveries           float64
required_robots                float64
expansion_cost                 float64
dtype: object

--- First 5 Rows ---
   campus_id           timestamp  projected_deliveries  required_robots  \
0        3.0 2023-06-21 07:28:00                 108.0              7.0   
1        7.0 2025-03-21 04:13:00                 112.0              6.0   
2        5.0 2024-06-27 10:59:00                 134.0             11.0   
3        9.0 2020-07-29 06:57:00                 116.0             16.0   
4        3.0 2022-06-09 12:15:00                  87.0             17.0   

   expansion_cost  
0     3110.191200  
1     8393.164235  
2     9426.772879  
3     2183.157559  
4     3581.815198  


In [20]:
import pandas as pd
import numpy as np

# 1. Load your latest file
path = r'C:\Users\noahi\Downloads\scaling_forecasts_PRODUCTION_READY.csv'
df = pd.read_csv(path)

# 2. Define the columns to fix
cols_to_fix = ['expansion_cost', 'projected_deliveries', 'required_robots']

for col in cols_to_fix:
    # A. First, convert 'inf' and extreme placeholders to NaN so they don't 
    # ruin the median calculation.
    # We'll treat anything larger than 1 billion as an error/placeholder.
    df[col] = pd.to_numeric(df[col], errors='coerce')
    df.loc[df[col].abs() > 1e9, col] = np.nan
    
    # B. Calculate the Median (the middle value) of the REAL data
    median_value = df[col].median()
    
    # C. Fill the NaNs (the former outliers) with that median
    df[col] = df[col].fillna(median_value)
    
    print(f"Fixed {col}: Replaced outliers with median: {median_value:,.2f}")

# 3. Final Check: Your Mean and Std should now look normal
print("\n--- Final Cleaned Statistics ---")
print(df[cols_to_fix].describe())

# 4. Save the Final, Final version
df.to_csv(r'C:\Users\noahi\Downloads\scaling_forecasts_FINAL_CLEAN.csv', index=False)

Fixed expansion_cost: Replaced outliers with median: 5,499.29
Fixed projected_deliveries: Replaced outliers with median: 100.00
Fixed required_robots: Replaced outliers with median: 10.00

--- Final Cleaned Statistics ---
       expansion_cost  projected_deliveries  required_robots
count    8.350710e+05          8.350710e+05     8.350710e+05
mean     6.828947e+03          2.614505e+02    -3.763708e+01
std      1.099814e+06          1.643765e+05     2.969883e+05
min      1.000016e+03         -1.480000e+07    -1.900000e+08
25%      3.250057e+03          9.300000e+01     5.000000e+00
50%      5.499289e+03          1.000000e+02     1.000000e+01
75%      7.746154e+03          1.070000e+02     1.500000e+01
max      9.999997e+08          1.480000e+08     1.900000e+08


In [18]:
import os
path = r'C:\Users\noahi\Downloads\scaling_forecasts_PRODUCTION_READY.csv'
print(os.path.exists(path))

False


In [19]:
df.to_csv(r'C:\Users\noahi\Downloads\scaling_forecasts_PRODUCTION_READY.csv', index=False)
print("File saved and ready for reporting!")

File saved and ready for reporting!


In [21]:
import pandas as pd
import numpy as np

# 1. Load your latest file
path = r'C:\Users\noahi\Downloads\scaling_forecasts_PRODUCTION_READY.csv'
df = pd.read_csv(path)

# 2. Define the columns to fix
cols_to_fix = ['expansion_cost', 'projected_deliveries', 'required_robots']

for col in cols_to_fix:
    # A. First, convert 'inf' and extreme placeholders to NaN so they don't 
    # ruin the median calculation.
    # We'll treat anything larger than 1 billion as an error/placeholder.
    df[col] = pd.to_numeric(df[col], errors='coerce')
    df.loc[df[col].abs() > 1e9, col] = np.nan
    
    # B. Calculate the Median (the middle value) of the REAL data
    median_value = df[col].median()
    
    # C. Fill the NaNs (the former outliers) with that median
    df[col] = df[col].fillna(median_value)
    
    print(f"Fixed {col}: Replaced outliers with median: {median_value:,.2f}")

# 3. Final Check: Your Mean and Std should now look normal
print("\n--- Final Cleaned Statistics ---")
print(df[cols_to_fix].describe())

# 4. Save the Final, Final version
df.to_csv(r'C:\Users\noahi\Downloads\scaling_forecasts_FINAL_CLEAN.csv', index=False)

Fixed expansion_cost: Replaced outliers with median: 5,499.29
Fixed projected_deliveries: Replaced outliers with median: 100.00
Fixed required_robots: Replaced outliers with median: 10.00

--- Final Cleaned Statistics ---
       expansion_cost  projected_deliveries  required_robots
count    8.350710e+05          8.350710e+05     8.350710e+05
mean     6.828947e+03          2.614505e+02    -3.763708e+01
std      1.099814e+06          1.643765e+05     2.969883e+05
min      1.000016e+03         -1.480000e+07    -1.900000e+08
25%      3.250057e+03          9.300000e+01     5.000000e+00
50%      5.499289e+03          1.000000e+02     1.000000e+01
75%      7.746154e+03          1.070000e+02     1.500000e+01
max      9.999997e+08          1.480000e+08     1.900000e+08


In [22]:
import pandas as pd
import numpy as np

# 1. Global Clean: Blanks & Duplicates
df = df.dropna(how='all') # Only drops if the ENTIRE row is empty
df = df.drop_duplicates()

# 2. Numeric Clean: Median for Outliers (Math columns only)
math_cols = ['expansion_cost', 'projected_deliveries', 'required_robots']

for col in math_cols:
    # Remove the "Infinity" and "Ghost" values first
    df[col] = pd.to_numeric(df[col], errors='coerce')
    df.loc[df[col].abs() > 1e9, col] = np.nan
    
    # Fill gaps and outliers with Median
    med = df[col].median()
    df[col] = df[col].fillna(med)

# 3. ID & Date Clean: Remove rows with missing IDs (they are useless)
df = df.dropna(subset=['campus_id', 'timestamp'])

print("✅ Data is now 'Tidy': No duplicates, no blanks in key areas, and math is safe.")

✅ Data is now 'Tidy': No duplicates, no blanks in key areas, and math is safe.


In [None]:
which file bath file is the last tidy data