# Imports and Modules

In [2]:
import pandas as pd
import csv
from tqdm import tqdm

# Text to Dataframe

In [3]:
def txt_to_dataframe(file_path, column_names=None):
    # Read the .txt file into a DataFrame; file is tab-separated
    df = pd.read_csv(file_path, 
                     sep='\t', 
                     header=None, 
                     names=column_names, 
                     encoding='latin1',
                     na_values=[''], # Treat empty fields as NaN
                     keep_default_na=True) # Keep the default NaN recognizers
    return df

# Load data
file_path = r'data/state_city.txt'
# Headers
column_names = ['Latitude', 'Longitude', 'City', 'State']
# Read the file into a DataFrame
df2 = txt_to_dataframe(file_path, column_names)

# Display the first few rows of the DataFrame
print(df2.head())

# Get info about the DataFrame
print(df2.info())

# If you want to save it as a different file format, like CSV:
df2.to_csv('state_city.csv', index=False)

    Latitude   Longitude                                     City State
0  13.686980  100.609881  à¸à¸£à¸¸à¸à¹à¸à¸à¸¡à¸«à¸²à¸à¸à¸£   NaN
1  13.690787  100.609060  à¸à¸£à¸¸à¸à¹à¸à¸à¸¡à¸«à¸²à¸à¸à¸£   NaN
2  13.699518  100.605383  à¸à¸£à¸¸à¸à¹à¸à¸à¸¡à¸«à¸²à¸à¸à¸£   NaN
3  13.711378  100.597386  à¸à¸£à¸¸à¸à¹à¸à¸à¸¡à¸«à¸²à¸à¸à¸£   NaN
4  13.715302  100.591672  à¸à¸£à¸¸à¸à¹à¸à¸à¸¡à¸«à¸²à¸à¸à¸£   NaN
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46322 entries, 0 to 46321
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Latitude   46322 non-null  float64
 1   Longitude  46322 non-null  float64
 2   City       42482 non-null  object 
 3   State      45966 non-null  object 
dtypes: float64(2), object(2)
memory usage: 1.4+ MB
None


# Preprocess file 'state_city.txt'

In [4]:
# Method 1: Sort by State and then by City
def sort_by_state_and_city(df):
    return df.sort_values(['State', 'City'])

# Method 2: Group by State and City, aggregating latitude and longitude
def group_by_state_and_city(df):
    return df.groupby(['State', 'City']).agg({
        'Latitude': 'mean',
        'Longitude': 'mean'
    }).reset_index()

# Apply Method 1
df_sorted = sort_by_state_and_city(df2)
print("Sorted by State and City:")
print(df_sorted)

# Apply Method 2
df_grouped = group_by_state_and_city(df2)
print("\nGrouped by State and City (with average coordinates):")
print(df_grouped)

# Save the results
df_sorted.to_csv(r'data/sorted_cities.csv', index=False)
df_grouped.to_csv(r'data/grouped_cities.csv', index=False)

# To get a list of unique cities in each state
cities_by_state = df2.groupby('State')['City'].unique()
print("\nUnique cities in each state:")
print(cities_by_state)

Sorted by State and City:
        Latitude   Longitude            City    State
8295   33.580862  -86.956455      Adamsville  Alabama
7055   32.923909  -85.949771  Alexander City  Alabama
14802  34.778611  -86.944281          Athens  Alabama
14853  34.790520  -86.975996          Athens  Alabama
14868  34.798632  -86.975188          Athens  Alabama
...          ...         ...             ...      ...
8      18.059533  -76.126374             NaN      NaN
28     21.806420 -112.150270             NaN      NaN
239    25.717681  -70.942583             NaN      NaN
1244   26.133651  -82.321223             NaN      NaN
7667   33.428679  -70.602265             NaN      NaN

[46322 rows x 4 columns]

Grouped by State and City (with average coordinates):
                                                  State  \
0                                               Alabama   
1                                               Alabama   
2                                               Alabama   
3        

## Extract USA states from df

In [6]:
us_states = {
    'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware',
    'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky',
    'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi',
    'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico',
    'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania',
    'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont',
    'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming', 'District of Columbia',
    'American Samoa', 'Guam', 'Northern Mariana Islands', 'Puerto Rico', 'U.S. Virgin Islands'
}

# Select from above df
usa_df = df_grouped[df_grouped['State'].isin(us_states)]

# Save .csv
usa_df.to_csv(r'data/usa_df.csv', index=False)

print(usa_df.head())


     State            City   Latitude  Longitude
0  Alabama      Adamsville  33.580862 -86.956455
1  Alabama  Alexander City  32.923909 -85.949771
2  Alabama          Athens  34.806198 -86.958965
3  Alabama          Auburn  32.594473 -85.480678
4  Alabama        Bessemer  33.336886 -86.955847


## Round the coordinates

In [11]:
# Create a new dataframe with rounded values
usa_df_rounded = usa_df.copy()

# Round Latitude and Longitude to 3 decimal places
usa_df_rounded['Latitude'] = usa_df_rounded['Latitude'].round(3)
usa_df_rounded['Longitude'] = usa_df_rounded['Longitude'].round(3)

# Save to .csv
usa_df_rounded.to_csv('usa_df_rounded.csv', index=False)

print(usa_df_rounded.head())

     State            City  Latitude  Longitude
0  Alabama      Adamsville    33.581    -86.956
1  Alabama  Alexander City    32.924    -85.950
2  Alabama          Athens    34.806    -86.959
3  Alabama          Auburn    32.594    -85.481
4  Alabama        Bessemer    33.337    -86.956
