In [3]:
import pandas as pd

In [7]:
# Load the raw 2005 workbook exactly as stored; all trimming happens in the cells below.
mig2005 = pd.read_excel('migration_flows_2005.xls')


In [8]:
# Discard the three trailing rows, then renumber the remaining rows from zero
mig2005 = mig2005.head(-3).reset_index(drop=True)

In [9]:
# Collect every column in which at least one cell, read as text, contains 'MOE'
moe_columns = list(filter(
    lambda label: mig2005[label].astype(str).str.contains('MOE').any(),
    mig2005.columns,
))

# Remove the collected columns
mig2005 = mig2005.drop(moe_columns, axis=1)

In [10]:
# Discard the five leading rows, then renumber the remaining rows from zero
mig2005 = mig2005.tail(-5).reset_index(drop=True)

In [11]:
# Remove rows in which every cell is missing and renumber the rest in one step
mig2005 = mig2005.dropna(how='all').reset_index(drop=True)

In [12]:
# Remove the rows labelled 1, 32, 33 and 34 (rows 2 and 33-35 when counting
# from one), then renumber the remaining rows from zero
mig2005 = mig2005.drop([1, 32, 33, 34], axis=0).reset_index(drop=True)

In [13]:
# Overwrite the top-left cell with 0 so that, once the first column becomes the
# index, the first row carries the label 0 instead of its original cell content.
mig2005.iloc[0, 0] = 0

# Promote the first column to the row index; the remaining columns keep their labels.
mig2005 = mig2005.set_index(mig2005.columns[0])

In [14]:
# Keep only the columns whose cell in the first row is present
mig2005 = mig2005.loc[:, ~mig2005.iloc[0].isna()]

In [15]:
# The first row holds the label that each column contributes as 'Origin'
states = mig2005.iloc[0].values

# Reshape the wide table into one record per populated cell below the first row
data = []
for i in range(1, len(mig2005)):
    for j in range(len(mig2005.columns)):
        value = mig2005.iloc[i, j]
        # Skip missing cells as well as empty strings: NaN != '' evaluates to
        # True, so the string comparison alone would let NaN records through.
        if pd.notna(value) and value != '':
            data.append({
                'Origin': states[j],
                'Direction': mig2005.index[i],
                'Year': 2005,
                'Population Change': value
            })
flows_05 = pd.DataFrame(data)

In [2]:
import pandas as pd
import os
import glob

# For 2005-2009
def process_migration_file(file_path):
    """Convert one migration-flows workbook into a long table of flows.

    The year is the integer found between the last underscore of
    ``file_path`` and the dot that follows it. The sheet is trimmed with
    fixed row offsets, columns holding an 'MOE' cell are removed, and each
    populated cell below the first remaining row becomes one record.

    Returns:
        A DataFrame with one row per populated cell, carrying 'Origin'
        (first-row value of the cell's column), 'Direction' (row label),
        'Year' and 'Population Change'; or None when any step raised, in
        which case the error is printed rather than propagated.
    """
    try:
        print(f"Processing file: {file_path}")
        sheet = pd.read_excel(file_path)

        # e.g. 'migration_flows_2005.xls' -> '2005.xls' -> 2005
        tail = file_path.split('_')[-1]
        year = int(tail.split('.')[0])
        print(f"Extracted year: {year}")

        # The three trailing rows are removed before the MOE scan, so their
        # text is never inspected.
        sheet = sheet.iloc[:-3].reset_index(drop=True)

        moe_columns = []
        for label in sheet.columns:
            if sheet[label].astype(str).str.contains('MOE').any():
                moe_columns.append(label)
        sheet = sheet.drop(columns=moe_columns)

        # Five leading rows go next, followed by rows that are entirely missing.
        sheet = sheet.iloc[5:].reset_index(drop=True)
        sheet = sheet.dropna(how='all')
        sheet = sheet.reset_index(drop=True)

        # Rows 2 and 33-35 (counting from one). If any of these labels is
        # absent, nothing is dropped and processing carries on.
        try:
            without_extra_rows = sheet.drop(index=[1, 32, 33, 34])
        except KeyError as e:
            print(f"Warning: Some indices not found in {file_path}, continuing...")
        else:
            sheet = without_extra_rows.reset_index(drop=True)

        # Top-left cell becomes 0, then the first column turns into the index.
        sheet.iloc[0, 0] = 0
        sheet = sheet.set_index(sheet.columns[0])

        # Only columns with a value in the first row are kept.
        first_row = sheet.iloc[0]
        sheet = sheet.loc[:, first_row.notna()]

        states = sheet.iloc[0].values
        row_count, col_count = sheet.shape
        records = []
        for i in range(1, row_count):
            for j in range(col_count):
                cell = sheet.iloc[i, j]
                if pd.isna(cell) or cell == '':
                    continue
                records.append({
                    'Origin': states[j],
                    'Direction': sheet.index[i],
                    'Year': year,
                    'Population Change': cell
                })

        result_df = pd.DataFrame(records)
        print(f"Successfully processed {file_path}, created {len(result_df)} rows")
        return result_df

    except Exception as e:
        print(f"Error processing {file_path}: {str(e)}")
        return None

# Get all Excel files in the directory. glob returns names in arbitrary order,
# so sort them to make the row order of the combined CSV reproducible.
excel_files = sorted(glob.glob('*.xls*'))
print(f"Found {len(excel_files)} Excel files: {excel_files}")

# Process all files and store results in a list
all_flows = []
for file in excel_files:
    # '~$name.xlsx' is the lock file Excel keeps next to an open workbook; it
    # matches the pattern above but is not a readable workbook.
    if file.startswith('~$'):
        continue
    flows = process_migration_file(file)
    if flows is not None:
        all_flows.append(flows)

# Check if we have any processed data
if len(all_flows) > 0:
    # Combine all flows into one dataframe
    combined_flows = pd.concat(all_flows, ignore_index=True)
    print(f"Created combined dataset with {len(combined_flows)} rows")

    # Save the combined results
    combined_flows.to_csv('migration_flows_combined.csv', index=False)
    print("Saved results to migration_flows_combined.csv")
else:
    print("No files were successfully processed!")


Found 17 Excel files: ['migration_flows_2005.xls', 'migration_flows_2006.xls', 'migration_flows_2007.xls', 'migration_flows_2008.xls', 'migration_flows_2009.xls', 'migration_flows_2010.xls', 'migration_flows_2011.xls', 'migration_flows_2012.xls', 'migration_flows_2013.xls', 'migration_flows_2014.xls', 'migration_flows_2015.xls', 'migration_flows_2016.xls', 'migration_flows_2017.xls', 'migration_flows_2021.xls', 'migration_flows_2022.xlsx', 'migration_flows_2023.xlsx', '~$migration_flows_2023.xlsx']
Processing file: migration_flows_2005.xls
Extracted year: 2005
Successfully processed migration_flows_2005.xls, created 2704 rows
Processing file: migration_flows_2006.xls
Extracted year: 2006
Successfully processed migration_flows_2006.xls, created 2704 rows
Processing file: migration_flows_2007.xls
Extracted year: 2007
Successfully processed migration_flows_2007.xls, created 2704 rows
Processing file: migration_flows_2008.xls
Extracted year: 2008
Successfully processed migration_flows_2008

In [3]:
# Load the raw 2010 workbook; the cleanup cell below trims it step by step.
mig2010 = pd.read_excel('migration_flows_2010.xls')


In [None]:
# Preview the first rows instead of rendering the whole raw sheet
mig2010.head()

In [4]:
# Exploratory cleanup of the 2010 sheet. Every step addresses rows/columns by
# position, so the steps only make sense in exactly this order.

# Drop the last 8 rows
mig2010 = mig2010.iloc[:-8].reset_index(drop=True)

# Find columns that contain 'MOE' as a value (cells are compared as text)
moe_columns = [col for col in mig2010.columns if mig2010[col].astype(str).str.contains('MOE').any()]

# Drop those columns
mig2010 = mig2010.drop(columns=moe_columns)

# Drop rows where all values are missing
mig2010 = mig2010.dropna(how='all').reset_index(drop=True)

# Delete the second and third columns (positions 1 and 2 of the current frame)
mig2010 = mig2010.drop(mig2010.columns[[1, 2]], axis=1)

# Overwrite the top-left cell; it is overwritten again with "Origin" further down
mig2010.iloc[0,0] = 0

# Find columns that contain 'Table 1' as a value
extra_columns = [col for col in mig2010.columns if mig2010[col].astype(str).str.contains('Table 1').any()]

# Drop those columns
mig2010 = mig2010.drop(columns=extra_columns)

# Drop the third column (position 2 of what is left after the drops above)
mig2010 = mig2010.drop(mig2010.columns[2], axis=1)

# Drop first four rows
mig2010 = mig2010.iloc[4:].reset_index(drop=True)

# Label the second column of the (future) header row
mig2010.iloc[0,1] = 'Stayed'

# Delete rows 1, 2, 30, 31, 32
mig2010 = mig2010.drop(index=[1, 2, 30, 31, 32]).reset_index(drop=True)

# Label the first column of the (future) header row
mig2010.iloc[0,0] = "Origin"

# Make the first row the column names
mig2010.columns = mig2010.iloc[0]

# Delete the first row
mig2010 = mig2010.iloc[1:].reset_index(drop=True)

# Replace missing values where Origin matches column name with values from 'Stayed' column
# NOTE(review): the loop also visits the 'Origin' and 'Stayed' columns themselves;
# later versions of this logic skip them explicitly.
for col in mig2010.columns:
    # Find the row where Origin equals the column name
    matching_row = mig2010[mig2010['Origin'] == col]
    if not matching_row.empty:
        # Get the 'Stayed' value for this state (first match only)
        stayed_value = matching_row['Stayed'].values[0]
        # Replace the missing value in the corresponding column
        mig2010.loc[mig2010['Origin'] == col, col] = stayed_value


Found 17 Excel files: ['migration_flows_2005.xls', 'migration_flows_2006.xls', 'migration_flows_2007.xls', 'migration_flows_2008.xls', 'migration_flows_2009.xls', 'migration_flows_2010.xls', 'migration_flows_2011.xls', 'migration_flows_2012.xls', 'migration_flows_2013.xls', 'migration_flows_2014.xls', 'migration_flows_2015.xls', 'migration_flows_2016.xls', 'migration_flows_2017.xls', 'migration_flows_2021.xls', 'migration_flows_2022.xlsx', 'migration_flows_2023.xlsx', '~$migration_flows_2023.xlsx']
Processing file: migration_flows_2005.xls
Extracted year: 2005
Successfully processed migration_flows_2005.xls, created 2756 rows
Processing file: migration_flows_2006.xls
Extracted year: 2006
Successfully processed migration_flows_2006.xls, created 2756 rows
Processing file: migration_flows_2007.xls
Extracted year: 2007
Successfully processed migration_flows_2007.xls, created 2756 rows
Processing file: migration_flows_2008.xls
Extracted year: 2008
Successfully processed migration_flows_2008

In [3]:
# Inspect the last 20 rows of the combined table that is currently in memory.
# To inspect the saved file instead, load it first with
# pd.read_csv('migration_flows_combined.csv').

combined_flows.tail(20)




Unnamed: 0,Origin,Direction,Year,Population Change
16195,Puerto Rico,Ohio,2023,0
16196,Puerto Rico,Oklahoma,2023,0
16197,Puerto Rico,Oregon,2023,0
16198,Puerto Rico,Pennsylvania,2023,3040
16199,Puerto Rico,Rhode Island,2023,0
16200,Puerto Rico,South Carolina,2023,314
16201,Puerto Rico,South Dakota,2023,0
16202,Puerto Rico,Tennessee,2023,293
16203,Puerto Rico,Texas,2023,2252
16204,Puerto Rico,Utah,2023,37


In [1]:
import pandas as pd
import os
import glob

def process_migration_file(file_path):
    """Convert one migration-flows workbook into a long table of flows.

    The year is the integer found between the last underscore of
    ``file_path`` and the dot that follows it; files from 2010 onwards and
    earlier files are trimmed with different fixed row/column offsets.

    Returns:
        A DataFrame with one row per populated cell and the keys 'Origin',
        'Direction', 'Year' and 'Population Change', or None when any step
        raised (the error is printed, not propagated).

    NOTE(review): the 2010+ branch takes 'Origin' from the row and
    'Direction' from the column label, while the pre-2010 branch takes
    'Origin' from the column's header-row value and 'Direction' from the row
    label. Confirm against the sheet layouts that both mean the same thing.
    """
    try:
        print(f"Processing file: {file_path}")
        # Read the Excel file
        df = pd.read_excel(file_path)
        
        # Get the year from the filename
        year = int(file_path.split('_')[-1].split('.')[0])
        print(f"Extracted year: {year}")
        
        if year >= 2010:
            # Process 2010 and later files
            # Drop the last 8 rows
            df = df.iloc[:-8].reset_index(drop=True)
            
            # Find and drop MOE columns (any cell, read as text, contains 'MOE')
            moe_columns = []
            for col in df.columns:
                if df[col].astype(str).str.contains('MOE').any():
                    moe_columns.append(col)
            df = df.drop(columns=moe_columns)
            
            # Drop rows where all values are missing
            df = df.dropna(how='all').reset_index(drop=True)
            
            # Delete the second and third columns
            df = df.drop(df.columns[[1, 2]], axis=1)
           
            
            # Drop the third column if it exists
            if len(df.columns) > 2:
                df = df.drop(df.columns[2], axis=1)
            
            # Drop first four rows
            df = df.iloc[4:].reset_index(drop=True)
            
            # Label the second column of the row that becomes the header below
            df.iloc[0,1] = 'Stayed'
            
            # Delete specific rows (raises KeyError, handled by the outer
            # except, if any of these labels is absent)
            df = df.drop(index=[1, 2, 30, 31, 32]).reset_index(drop=True)
            
            # Label the first column of the row that becomes the header below
            df.iloc[0,0] = "Origin"
            
            # Make the first row the column names
            df.columns = df.iloc[0]
            df = df.iloc[1:].reset_index(drop=True)
            
            # Replace missing values where Origin matches column name with values from 'Stayed' column
            for col in df.columns:
                if col in ['Origin', 'Stayed']:
                    continue
                # Find the row where Origin equals the column name
                matching_rows = df[df['Origin'].astype(str) == str(col)]
                if not matching_rows.empty:
                    # Get the 'Stayed' value for this state (first match only)
                    stayed_value = matching_rows['Stayed'].values[0]
                    # Replace the missing value in the corresponding column
                    df.loc[df['Origin'].astype(str) == str(col), col] = stayed_value
            
            # Delete the 'Stayed' column
            df = df.drop(columns=['Stayed'])
            
            # Create flows dataframe for 2010+ files: one record per non-blank
            # cell, keyed by the row's 'Origin' value and the column label
            data = []
            for idx, row in df.iterrows():
                origin = row['Origin']
                for col in df.columns:
                    if col != 'Origin' and pd.notna(row[col]) and str(row[col]).strip() != '':
                        data.append({
                            'Origin': origin,
                            'Direction': col,
                            'Year': year,
                            'Population Change': row[col]
                        })
            
        else:
            # Process pre-2010 files using original method
            # Drop the last 3 rows
            df = df.iloc[:-3].reset_index(drop=True)
            
            # Find and drop MOE columns (any cell, read as text, contains 'MOE')
            moe_columns = []
            for col in df.columns:
                if df[col].astype(str).str.contains('MOE').any():
                    moe_columns.append(col)
            df = df.drop(columns=moe_columns)
            
            # Drop the first 5 rows
            df = df.iloc[5:].reset_index(drop=True)
            
            # Drop rows where all values are missing
            df = df.dropna(how='all').reset_index(drop=True)
            
            try:
                # Drop specific rows (2 and 33-35); if any label is absent
                # nothing is dropped at all
                df = df.drop(index=[1, 32, 33, 34]).reset_index(drop=True)
            except KeyError as e:
                print(f"Warning: Some indices not found in {file_path}, continuing...")
            
            # Make the first row first column equal to 0
            df.iloc[0, 0] = 0
            
            # Set the first column as the index
            df = df.set_index(df.columns[0])
            
            # Drop columns where the first row has missing values
            df = df.loc[:, df.iloc[0].notna()]
            
            # Create flows dataframe for pre-2010 files: the first row supplies
            # the per-column label, the index supplies the per-row label
            states = df.iloc[0].values
            data = []
            for i in range(1, len(df)):
                for j in range(len(df.columns)):
                    if pd.notna(df.iloc[i, j]) and str(df.iloc[i, j]).strip() != '':
                        data.append({
                            'Origin': states[j],
                            'Direction': df.index[i],
                            'Year': year,
                            'Population Change': df.iloc[i, j]
                        })
        
        result_df = pd.DataFrame(data)
        print(f"Successfully processed {file_path}, created {len(result_df)} rows")
        return result_df
    
    except Exception as e:
        # Best-effort: report the failure and let the caller skip this file
        print(f"Error processing {file_path}: {str(e)}")
        return None

# Every workbook in the working directory, old (.xls) and new (.xlsx) format alike
excel_files = glob.glob('*.xls*')
print(f"Found {len(excel_files)} Excel files: {excel_files}")

# Parse each workbook and keep the tables that were produced
all_flows = []
for file in excel_files:
    if file.startswith('~$'):
        # Temporary Excel file, not a workbook
        continue
    flows = process_migration_file(file)
    if flows is None:
        continue
    all_flows.append(flows)

if not all_flows:
    print("No files were successfully processed!")
else:
    # Stack the per-file tables under a fresh running index
    combined_flows = pd.concat(all_flows, ignore_index=True)
    print(f"Created combined dataset with {len(combined_flows)} rows")

    # Persist the stacked table without the index column
    combined_flows.to_csv('migration_flows_combined.csv', index=False)
    print("Saved results to migration_flows_combined.csv")

Found 17 Excel files: ['migration_flows_2005.xls', 'migration_flows_2006.xls', 'migration_flows_2007.xls', 'migration_flows_2008.xls', 'migration_flows_2009.xls', 'migration_flows_2010.xls', 'migration_flows_2011.xls', 'migration_flows_2012.xls', 'migration_flows_2013.xls', 'migration_flows_2014.xls', 'migration_flows_2015.xls', 'migration_flows_2016.xls', 'migration_flows_2017.xls', 'migration_flows_2021.xls', 'migration_flows_2022.xlsx', 'migration_flows_2023.xlsx', '~$migration_flows_2023.xlsx']
Processing file: migration_flows_2005.xls
Extracted year: 2005
Successfully processed migration_flows_2005.xls, created 2704 rows
Processing file: migration_flows_2006.xls
Extracted year: 2006
Successfully processed migration_flows_2006.xls, created 2704 rows
Processing file: migration_flows_2007.xls
Extracted year: 2007
Successfully processed migration_flows_2007.xls, created 2704 rows
Processing file: migration_flows_2008.xls
Extracted year: 2008
Successfully processed migration_flows_2008

In [8]:

# Scratch run of the 2010+ cleanup on a single workbook
df = pd.read_excel('migration_flows_2010.xls')

# Trim the eight trailing rows
df = df.iloc[:-8]
df = df.reset_index(drop=True)

# Remove every column in which some cell, read as text, contains 'MOE'
moe_columns = list(filter(
    lambda label: df[label].astype(str).str.contains('MOE').any(),
    df.columns,
))
df = df.drop(columns=moe_columns)

# Remove rows that are missing in every column
df = df.dropna(how='all')
df = df.reset_index(drop=True)

# Trim the four leading rows
df = df.iloc[4:]
df = df.reset_index(drop=True)

# Label the second cell of the row that becomes the header further down
df.iloc[0, 1] = 'Stayed'

# Remove the rows labelled 1, 2, 30, 31 and 32
df = df.drop(index=[1, 2, 30, 31, 32])
df = df.reset_index(drop=True)

# Label the first cell of the header row, promote that row to column labels,
# then remove it from the data
df.iloc[0, 0] = "Origin"
df.columns = df.iloc[0]
df = df.iloc[1:]
df = df.reset_index(drop=True)

# For each state column, copy the 'Stayed' value of the row whose Origin is
# that same state into the matching cell
for col in df.columns:
    if col not in ['Origin', 'Stayed']:
        matching_row = df[df['Origin'] == col]
        if not matching_row.empty:
            stayed_value = matching_row['Stayed'].values[0]
            df.loc[df['Origin'] == col, col] = stayed_value

# 'Stayed' has been folded into the state columns and is no longer needed
df = df.drop(columns=['Stayed'])

# Remove columns whose label, read as text, contains 'nan'
df = df.drop(columns=[col for col in df.columns if 'nan' in str(col)])
            

In [1]:
import pandas as pd
import os
import glob

def _drop_moe_columns(df):
    """Return ``df`` without the columns whose label or any cell contains 'MOE'."""
    moe_columns = []
    for col in df.columns:
        if isinstance(col, str) and 'MOE' in col:
            moe_columns.append(col)
        elif df[col].astype(str).str.contains('MOE').any():
            moe_columns.append(col)
    return df.drop(columns=moe_columns)


def _append_flow(records, origin, direction, year, value):
    """Append one flow record to ``records`` when ``value`` converts to float.

    Missing and blank values are ignored silently; values that are present
    but not numeric are reported with a warning and ignored.
    """
    if pd.isna(value) or str(value).strip() == '':
        return
    try:
        amount = float(value)
    except (ValueError, TypeError):
        print(f"Warning: Could not convert value '{value}' to float")
        return
    records.append({
        'Origin': origin,
        'Direction': direction,
        'Year': year,
        'Population Change': amount
    })


def process_migration_file(file_path):
    """Convert one migration-flows workbook into a long table of flows.

    The year is the integer found between the last underscore of
    ``file_path`` and the dot that follows it; files from 2010 onwards and
    earlier files are trimmed with different fixed row offsets.

    Args:
        file_path: Path of the workbook. Files whose name starts with '~$'
            are skipped without any output.

    Returns:
        A DataFrame with one row per numeric cell and the keys 'Origin',
        'Direction', 'Year' and 'Population Change' (float), or None when
        the file was skipped or any step raised (the error is printed, not
        propagated).

    NOTE(review): the 2010+ branch takes 'Origin' from the row and
    'Direction' from the column label, while the pre-2010 branch takes
    'Origin' from the column's header-row value and 'Direction' from the row
    label. Confirm against the sheet layouts that both mean the same thing.
    """
    try:
        # Test the file name rather than the whole path, so temporary Excel
        # files are also skipped when the path contains a directory.
        if os.path.basename(file_path).startswith('~$'):
            return None

        print(f"Processing file: {file_path}")
        # Read the Excel file
        df = pd.read_excel(file_path)

        # Get the year from the filename
        year = int(file_path.split('_')[-1].split('.')[0])
        print(f"Extracted year: {year}")

        data = []
        if year >= 2010:
            # Process 2010 and later files
            # Drop the last 8 rows
            df = df.iloc[:-8].reset_index(drop=True)

            df = _drop_moe_columns(df)

            # Drop rows where all values are missing
            df = df.dropna(how='all').reset_index(drop=True)

            # Drop first four rows
            df = df.iloc[4:].reset_index(drop=True)

            # Set 'Stayed' in second column header
            df.iloc[0, 1] = 'Stayed'

            # Delete specific rows, limited to the ones the frame is long enough to have
            rows_to_drop = [i for i in [1, 2, 30, 31, 32] if i < len(df)]
            df = df.drop(index=rows_to_drop).reset_index(drop=True)

            # Set "Origin" as first column header
            df.iloc[0, 0] = "Origin"

            # Make the first row the column names
            df.columns = df.iloc[0]
            df = df.iloc[1:].reset_index(drop=True)

            # Drop columns that have no label. Only a missing label (or the
            # literal text 'nan') qualifies; a plain substring test would also
            # remove any real label that merely contains the letters "nan".
            unlabeled = [
                col for col in df.columns
                if pd.isna(col) or str(col).strip().lower() == 'nan'
            ]
            df = df.drop(columns=unlabeled)

            for _, row in df.iterrows():
                origin = row['Origin']
                stayed_value = row.get('Stayed', None)

                for col in df.columns:
                    if col in ['Origin', 'Stayed']:
                        continue
                    # The cell where Origin matches the column label takes
                    # the row's 'Stayed' value
                    value = stayed_value if str(origin) == str(col) else row[col]
                    _append_flow(data, origin, col, year, value)

        else:
            # Process pre-2010 files
            # Drop the last 3 rows
            df = df.iloc[:-3].reset_index(drop=True)

            df = _drop_moe_columns(df)

            # Drop the first 5 rows
            df = df.iloc[5:].reset_index(drop=True)

            # Drop rows where all values are missing
            df = df.dropna(how='all').reset_index(drop=True)

            try:
                # Drop specific rows (2 and 33-35); if any label is absent
                # nothing is dropped at all
                df = df.drop(index=[1, 32, 33, 34]).reset_index(drop=True)
            except KeyError:
                print(f"Warning: Some indices not found in {file_path}, continuing...")

            # Make the first row first column equal to 0
            df.iloc[0, 0] = 0

            # Set the first column as the index
            df = df.set_index(df.columns[0])

            # Drop columns where the first row has missing values
            df = df.loc[:, df.iloc[0].notna()]

            # The first row supplies the per-column label, the index the per-row label
            states = df.iloc[0].values
            for i in range(1, len(df)):
                for j in range(len(df.columns)):
                    _append_flow(data, states[j], df.index[i], year, df.iloc[i, j])

        result_df = pd.DataFrame(data)
        print(f"Successfully processed {file_path}, created {len(result_df)} rows")
        return result_df

    except Exception as e:
        # Best-effort: report the failure and let the caller skip this file
        print(f"Error processing {file_path}: {str(e)}")
        return None

# Every workbook in the working directory, old (.xls) and new (.xlsx) format alike
excel_files = glob.glob('*.xls*')
print(f"Found {len(excel_files)} Excel files: {excel_files}")

# Parse each workbook; a None result means the file was skipped or failed
all_flows = []
for file in excel_files:
    flows = process_migration_file(file)
    if flows is None:
        continue
    all_flows.append(flows)

if not all_flows:
    print("No files were successfully processed!")
else:
    # Stack the per-file tables under a fresh running index
    combined_flows = pd.concat(all_flows, ignore_index=True)
    print(f"Created combined dataset with {len(combined_flows)} rows")

    # Persist the stacked table without the index column
    combined_flows.to_csv('migration_flows_combined.csv', index=False)
    print("Saved results to migration_flows_combined.csv")

Found 17 Excel files: ['migration_flows_2005.xls', 'migration_flows_2006.xls', 'migration_flows_2007.xls', 'migration_flows_2008.xls', 'migration_flows_2009.xls', 'migration_flows_2010.xls', 'migration_flows_2011.xls', 'migration_flows_2012.xls', 'migration_flows_2013.xls', 'migration_flows_2014.xls', 'migration_flows_2015.xls', 'migration_flows_2016.xls', 'migration_flows_2017.xls', 'migration_flows_2021.xls', 'migration_flows_2022.xlsx', 'migration_flows_2023.xlsx', '~$migration_flows_2023.xlsx']
Processing file: migration_flows_2005.xls
Extracted year: 2005
Successfully processed migration_flows_2005.xls, created 2704 rows
Processing file: migration_flows_2006.xls
Extracted year: 2006
Successfully processed migration_flows_2006.xls, created 2704 rows
Processing file: migration_flows_2007.xls
Extracted year: 2007
Successfully processed migration_flows_2007.xls, created 2704 rows
Processing file: migration_flows_2008.xls
Extracted year: 2008
Successfully processed migration_flows_2008

In [4]:
# Reload the untouched 2005 workbook and display it to check the raw layout
# (title rows at the top, footnote rows at the bottom).
mig2005 = pd.read_excel('migration_flows_2005.xls')
mig2005


Unnamed: 0,"Table with row headers in column A, L, W, AH, AS, BD, BO, BZ, CK, CV, and DG, and column headers in rows 6 through 8 and 46 through 48.",Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 105,Unnamed: 106,Unnamed: 107,Unnamed: 108,Unnamed: 109,Unnamed: 110,Unnamed: 111,Unnamed: 112,Unnamed: 113,Unnamed: 114
0,"Table. Movers Within and Between States, the ...",,,,,,,,,,...,,,,,,"Table. Movers Within and Between States, the ...",,,,
1,Dataset: 2005 American Community Survey 1-Year...,,,,,,,,,,...,,,,,,Dataset: 2005 American Community Survey 1-Year...,,,,
2,Universe: Population 1 year and over who moved...,,,,,,,,,,...,,,,,,Universe: Population 1 year and over who moved...,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,Current residence in --,Residence 1 year ago in --,,,,,,,,,...,,,,,,Current residence in --,Residence 1 year ago in --,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72,Wisconsin,2275,2656,136,137,2197,942,992,762,6743,...,958,2390,1532,242,278,Wisconsin,672491,19133,536,560
73,Wyoming,268,274,146,210,1229,1144,178,295,3353,...,514,781,561,49,83,Wyoming,35,66,61851,6023
74,Footnotes:,,,,,,,,,,...,,,,,,,,,,
75,MOE - Margin of error based on 90% confidence ...,,,,,,,,,,...,,,,,,MOE - Margin of error based on 90% confidence ...,,,,


In [1]:
import pandas as pd
import os
import glob

def process_migration_file(file_path):
    """Convert one migration-flows workbook into a long table of flows.

    The year is the integer found between the last underscore of
    ``file_path`` and the dot that follows it. It selects how many trailing
    rows are trimmed (8 from 2010 onwards, 3 before) and which fixed rows
    are removed; everything else is shared between the two periods.

    Args:
        file_path: Path of the workbook. Files whose name starts with '~$'
            are skipped without any output.

    Returns:
        A DataFrame with one row per numeric cell and the keys 'Direction'
        (first-row value of the cell's column), 'Origin' (row label), 'Year'
        and 'Population Change' (float), or None when the file was skipped
        or any step raised (the error is printed, not propagated).
    """
    try:
        # Test the file name rather than the whole path, so temporary Excel
        # files are also skipped when the path contains a directory.
        if os.path.basename(file_path).startswith('~$'):
            return None

        print(f"Processing file: {file_path}")
        # Read the Excel file
        df = pd.read_excel(file_path)

        # Get the year from the filename
        year = int(file_path.split('_')[-1].split('.')[0])
        print(f"Extracted year: {year}")

        # Drop the last rows (3 for pre-2010, 8 for 2010+)
        if year >= 2010:
            df = df.iloc[:-8].reset_index(drop=True)
        else:
            df = df.iloc[:-3].reset_index(drop=True)

        # Find columns that contain 'MOE' as a value and drop them
        moe_columns = []
        for col in df.columns:
            col_str = df[col].astype(str)
            if col_str.str.contains('MOE').any():
                moe_columns.append(col)
        df = df.drop(columns=moe_columns)

        # Drop the first 5 rows
        df = df.iloc[5:].reset_index(drop=True)

        # Drop rows where all values are missing
        df = df.dropna(how='all').reset_index(drop=True)

        # Drop specific rows. If any of the labels is absent pandas raises and
        # nothing is dropped; say so instead of continuing silently.
        rows_to_drop = [1, 2, 30, 31, 32] if year >= 2010 else [1, 32, 33, 34]
        try:
            df = df.drop(index=rows_to_drop).reset_index(drop=True)
        except KeyError:
            print(f"Warning: Some indices not found in {file_path}, continuing...")

        # Make the first row first column equal to 0
        df.iloc[0, 0] = 0

        # Set the first column as the index
        first_col = df.columns[0]
        df = df.set_index(first_col)

        # Drop columns where the first row has missing values
        df = df.loc[:, df.iloc[0].notna()]

        # Get states from the first row
        states = df.iloc[0].values

        # Create flows dataframe
        # NOTE(review): here the column's first-row value is stored as
        # 'Direction' and the row label as 'Origin', the reverse of the
        # mapping the earlier pre-2010 code used. Confirm which orientation
        # the sheets actually have.
        data = []
        skipped = 0
        for i in range(1, len(df)):
            for j in range(len(df.columns)):
                value = df.iloc[i, j]
                if pd.notna(value) and str(value).strip() != '':
                    try:
                        # Convert value to float, handling any commas
                        clean_value = str(value).replace(',', '')
                        float_value = float(clean_value)
                    except (ValueError, TypeError):
                        # Not a number: leave it out, but keep count
                        skipped += 1
                        continue
                    data.append({
                        'Direction': states[j],
                        'Origin': df.index[i],
                        'Year': year,
                        'Population Change': float_value
                    })

        if skipped:
            # One summary line instead of a message per cell
            print(f"Warning: skipped {skipped} non-numeric cells in {file_path}")

        result_df = pd.DataFrame(data)
        print(f"Successfully processed {file_path}, created {len(result_df)} rows")
        return result_df

    except Exception as e:
        # Best-effort: report the failure and let the caller skip this file
        print(f"Error processing {file_path}: {str(e)}")
        return None

# Every workbook in the working directory, old (.xls) and new (.xlsx) format alike
excel_files = glob.glob('*.xls*')
print(f"Found {len(excel_files)} Excel files: {excel_files}")

# Parse each workbook and keep the tables that were produced
all_flows = []
for file in excel_files:
    if file.startswith('~$'):
        # Temporary Excel file, not a workbook
        continue
    flows = process_migration_file(file)
    if flows is None:
        continue
    all_flows.append(flows)

if not all_flows:
    print("No files were successfully processed!")
else:
    # Stack the per-file tables under a fresh running index
    combined_flows = pd.concat(all_flows, ignore_index=True)
    print(f"Created combined dataset with {len(combined_flows)} rows")

    # Persist the stacked table without the index column
    combined_flows.to_csv('migration_flows_combined.csv', index=False)
    print("Saved results to migration_flows_combined.csv")

Found 19 Excel files: ['migration_flows_2005.xls', 'migration_flows_2006.xls', 'migration_flows_2007.xls', 'migration_flows_2008.xls', 'migration_flows_2009.xls', 'migration_flows_2010.xls', 'migration_flows_2011.xls', 'migration_flows_2012.xls', 'migration_flows_2013.xls', 'migration_flows_2014.xls', 'migration_flows_2015.xls', 'migration_flows_2016.xls', 'migration_flows_2017.xls', 'migration_flows_2018.xls', 'migration_flows_2019.xls', 'migration_flows_2021.xls', 'migration_flows_2022.xlsx', 'migration_flows_2023.xlsx', '~$migration_flows_2023.xlsx']
Processing file: migration_flows_2005.xls
Extracted year: 2005
Successfully processed migration_flows_2005.xls, created 2704 rows
Processing file: migration_flows_2006.xls
Extracted year: 2006
Successfully processed migration_flows_2006.xls, created 2704 rows
Processing file: migration_flows_2007.xls
Extracted year: 2007
Successfully processed migration_flows_2007.xls, created 2704 rows
Processing file: migration_flows_2008.xls
Extracte

In [2]:
# Reload the combined table from the CSV written by the processing cell, so the
# cells below do not depend on that cell having run in this kernel session.
combined_flows = pd.read_csv('migration_flows_combined.csv')

In [4]:
# Keep only the 2013 records and preview the first five
combined_flows_2013 = combined_flows.loc[combined_flows['Year'].eq(2013)]
combined_flows_2013.head(5)




Unnamed: 0,Direction,Origin,Year,Population Change
21935,Total,Alabama,2013,104102.0
21936,Alaska,Alabama,2013,1026.0
21937,Arizona,Alabama,2013,1077.0
21938,Arkansas,Alabama,2013,1108.0
21939,California,Alabama,2013,4918.0
