# Imports and Modules

In [2]:
import pandas as pd
import csv
from tqdm import tqdm

# Preprocess file 'full_text.txt'

In [3]:
def txt_to_dataframe(file_path, column_names=None):
    # Read the .txt file into a DataFrame
    # File is tab-seperated
    df = pd.read_csv(file_path, sep='\t', header=None, names=column_names, encoding='latin1')
    return df

# Load data:
file_path = r'data/full_text.txt'

# Headers
column_names = ['UserID', 'Timestamp', 'Coordinates', 'Latitude', 'Longitude', 'TweetText']

# Read the file into a DataFrame
df = txt_to_dataframe(file_path, column_names)

# Display the first few rows of the DataFrame
print(df.head())

# Drop a column
df_dropped = df.drop(columns=['Coordinates'])
print("\nDataFrame with 'Coordinates' dropped:")
print(df_dropped.head())

# Save the modified DataFrame back to a .txt file
df_dropped.to_csv(r'data/full_text_dropped.txt', sep='\t', index=False)
print("\nModified DataFrame saved to 'full_text_dropped_dropped.txt'")

# Save to CSV:
df.to_csv(r'data/full_text_dropped.csv', index=False)

          UserID            Timestamp                 Coordinates   Latitude  \
0  USER_79321756  2010-03-03T04:15:26  ÃT: 47.528139,-122.197916  47.528139   
1  USER_79321756  2010-03-03T04:55:32  ÃT: 47.528139,-122.197916  47.528139   
2  USER_79321756  2010-03-03T05:13:34  ÃT: 47.528139,-122.197916  47.528139   
3  USER_79321756  2010-03-03T05:28:02  ÃT: 47.528139,-122.197916  47.528139   
4  USER_79321756  2010-03-03T05:56:13  ÃT: 47.528139,-122.197916  47.528139   

    Longitude                                          TweetText  
0 -122.197916  RT @USER_2ff4faca: IF SHE DO IT 1 MORE TIME......  
1 -122.197916  @USER_77a4822d @USER_2ff4faca okay:) lol. Sayi...  
2 -122.197916  RT @USER_5d4d777a: YOURE A FAG FOR GETTING IN ...  
3 -122.197916  @USER_77a4822d yea ok..well answer that cheap ...  
4 -122.197916  A sprite can disappear in her mouth - lil kim ...  

DataFrame with 'Coordinates' dropped:
          UserID            Timestamp   Latitude   Longitude  \
0  USER_793217

In [6]:
def txt_to_dataframe(file_path, column_names=None):
    # Read the .txt file into a DataFrame; file is tab-separated
    df = pd.read_csv(file_path, 
                     sep='\t', 
                     header=None, 
                     names=column_names, 
                     encoding='latin1',
                     na_values=[''], # Treat empty fields as NaN
                     keep_default_na=True) # Keep the default NaN recognizers
    return df

# Load data
file_path = r'data/state_city.txt'
# Headers
column_names = ['Latitude', 'Longitude', 'City', 'State']
# Read the file into a DataFrame
df2 = txt_to_dataframe(file_path, column_names)

# Display the first few rows of the DataFrame
print(df2.head())

# Get info about the DataFrame
print(df2.info())

# If you want to save it as a different file format, like CSV:
df2.to_csv('state_city.csv', index=False)

    Latitude   Longitude                                     City State
0  13.686980  100.609881  à¸à¸£à¸¸à¸à¹à¸à¸à¸¡à¸«à¸²à¸à¸à¸£   NaN
1  13.690787  100.609060  à¸à¸£à¸¸à¸à¹à¸à¸à¸¡à¸«à¸²à¸à¸à¸£   NaN
2  13.699518  100.605383  à¸à¸£à¸¸à¸à¹à¸à¸à¸¡à¸«à¸²à¸à¸à¸£   NaN
3  13.711378  100.597386  à¸à¸£à¸¸à¸à¹à¸à¸à¸¡à¸«à¸²à¸à¸à¸£   NaN
4  13.715302  100.591672  à¸à¸£à¸¸à¸à¹à¸à¸à¸¡à¸«à¸²à¸à¸à¸£   NaN
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46322 entries, 0 to 46321
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Latitude   46322 non-null  float64
 1   Longitude  46322 non-null  float64
 2   City       42482 non-null  object 
 3   State      45966 non-null  object 
dtypes: float64(2), object(2)
memory usage: 1.4+ MB
None


# Preprocess file 'state_city.txt'

In [8]:
# Method 1: Sort by State and then by City
def sort_by_state_and_city(df):
    return df.sort_values(['State', 'City'])

# Method 2: Group by State and City, aggregating latitude and longitude
def group_by_state_and_city(df):
    return df.groupby(['State', 'City']).agg({
        'Latitude': 'mean',
        'Longitude': 'mean'
    }).reset_index()

# Apply Method 1
df_sorted = sort_by_state_and_city(df2)
print("Sorted by State and City:")
print(df_sorted)

# Apply Method 2
df_grouped = group_by_state_and_city(df2)
print("\nGrouped by State and City (with average coordinates):")
print(df_grouped)

# Save the results
df_sorted.to_csv(r'data/sorted_cities.csv', index=False)
df_grouped.to_csv(r'data/grouped_cities.csv', index=False)

# To get a list of unique cities in each state
cities_by_state = df2.groupby('State')['City'].unique()
print("\nUnique cities in each state:")
print(cities_by_state)

Sorted by State and City:
        Latitude   Longitude            City    State
8295   33.580862  -86.956455      Adamsville  Alabama
7055   32.923909  -85.949771  Alexander City  Alabama
14802  34.778611  -86.944281          Athens  Alabama
14853  34.790520  -86.975996          Athens  Alabama
14868  34.798632  -86.975188          Athens  Alabama
...          ...         ...             ...      ...
8      18.059533  -76.126374             NaN      NaN
28     21.806420 -112.150270             NaN      NaN
239    25.717681  -70.942583             NaN      NaN
1244   26.133651  -82.321223             NaN      NaN
7667   33.428679  -70.602265             NaN      NaN

[46322 rows x 4 columns]

Grouped by State and City (with average coordinates):
                                                  State  \
0                                               Alabama   
1                                               Alabama   
2                                               Alabama   
3        

In [3]:
# OpenCage API key
api_key = 'bdf3366c5d274d509c2b4d98ab202221'
geocoder = OpenCageGeocode(api_key)

# Load your Dataframe
data_path = r'data/state_city.txt'
city_state_df = pd.read_csv(data_path, sep='\t', header=None, names=['Latitude', 'Longitude', 'City', 'State'])

# Function to Reverse Geocode
def get_city_state(lat, lon):
    result = geocoder.reverse_geocode(lat, lon)
    if result and 'components' in result[0]:
        components = result[0]['components']
        city = components.get('city', '') or components.get('town', '') or components.get('village', '')
        state = components.get('state', '')
        return city, state
    return None, None

# Rate limiting the geocode call
geocode = RateLimiter(get_city_state, min_delay_seconds = 1)

# Apply the function to the DataFrame with tqdm progress bar
tqdm.pandas()  # Activates tqdm progress_apply for pandas

# Use progress_apply instead of apply to see progress
city_state_df[['City', 'State']] = city_state_df.progress_apply(
    lambda row: geocode(row['Latitude'], row['Longitude']), 
    axis=1, 
    result_type='expand'
)

# Apply the function to the DataFrame
#city_state_df[['City', 'State']] = city_state_df.apply(lambda row: geocode(row['Latitude'], row['Longitude']), axis=1, result_type='expand')

# Save or display the updated DataFrame
print(city_state_df.head())


  5%|▌         | 2502/46322 [42:54<12:31:37,  1.03s/it]


RateLimitExceededError: You have used the requests available on your plan. Please purchase more if you wish to continue: https://opencagedata.com/pricing

In [4]:
# Save the updated DataFrame
output_path = 'data/updated_state_city.csv'
city_state_df.to_csv(output_path, index=False)
print(f"Updated data saved to {output_path}")

Updated data saved to data/updated_state_city.csv
