# Imports and Modules

In [10]:
import csv
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm

# Preprocess File 'full_text.txt'

In [19]:
def txt_to_dataframe(file_path, column_names=None, sep='\t', encoding='latin1', **read_csv_kwargs):
    """Read a delimited, header-less text file into a DataFrame.

    Parameters
    ----------
    file_path : str or path-like
        Location of the text file.
    column_names : list of str, optional
        Names to assign to the columns. The first line of the file is
        treated as data (``header=None``), never as a header row.
    sep : str, optional
        Field separator. Defaults to a tab character (tab-separated file).
    encoding : str, optional
        Text encoding used to decode the file. Defaults to ``'latin1'``.
        NOTE(review): non-ASCII text that was really written as UTF-8 will
        come out garbled under latin1 -- confirm the file's real encoding.
    **read_csv_kwargs
        Any further keyword arguments are forwarded to ``pandas.read_csv``
        and take precedence over the defaults above, e.g.
        ``quoting=csv.QUOTE_NONE`` when fields may contain unbalanced
        double quotes, or ``nrows=1000`` for a quick sample.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file.
    """
    options = {
        'sep': sep,
        'header': None,
        'names': column_names,
        'encoding': encoding,
    }
    # Caller-supplied options win, so every read_csv setting stays reachable.
    options.update(read_csv_kwargs)
    return pd.read_csv(file_path, **options)

# Load data: raw tab-separated text file
file_path = r'data/full_text.txt'

# Headers (the loader reads the file with header=None, so names are supplied here)
column_names = ['UserID', 'Timestamp', 'Coordinates', 'Latitude', 'Longitude', 'TweetText']

# Read the file into a DataFrame
df = txt_to_dataframe(file_path, column_names)

# Display the first few rows of the DataFrame
print(df.head())

# Drop the 'Coordinates' column; 'Latitude' and 'Longitude' are kept
df_dropped = df.drop(columns=['Coordinates'])
print("\nDataFrame with 'Coordinates' dropped:")
print(df_dropped.head())

# Save to CSV. DataFrame.to_csv does not create missing parent directories,
# so make sure 'data/full_text/' exists before writing into it.
output_path = Path(r'data/full_text/full_text_dropped.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df_dropped.to_csv(output_path, index=False)

          UserID            Timestamp                 Coordinates   Latitude  \
0  USER_79321756  2010-03-03T04:15:26  ÃT: 47.528139,-122.197916  47.528139   
1  USER_79321756  2010-03-03T04:55:32  ÃT: 47.528139,-122.197916  47.528139   
2  USER_79321756  2010-03-03T05:13:34  ÃT: 47.528139,-122.197916  47.528139   
3  USER_79321756  2010-03-03T05:28:02  ÃT: 47.528139,-122.197916  47.528139   
4  USER_79321756  2010-03-03T05:56:13  ÃT: 47.528139,-122.197916  47.528139   

    Longitude                                          TweetText  
0 -122.197916  RT @USER_2ff4faca: IF SHE DO IT 1 MORE TIME......  
1 -122.197916  @USER_77a4822d @USER_2ff4faca okay:) lol. Sayi...  
2 -122.197916  RT @USER_5d4d777a: YOURE A FAG FOR GETTING IN ...  
3 -122.197916  @USER_77a4822d yea ok..well answer that cheap ...  
4 -122.197916  A sprite can disappear in her mouth - lil kim ...  

DataFrame with 'Coordinates' dropped:
          UserID            Timestamp   Latitude   Longitude  \
0  USER_793217

## Extract Known Coordinates Based on 'state_city'

In [22]:
# Reference table of known locations, prepared earlier under 'state_city'
df_usa = pd.read_csv(filepath_or_buffer=r'data/state_city/usa_df_rounded.csv')

# Tweet table saved by the preprocessing step ('Coordinates' already removed)
df_dropped = pd.read_csv(filepath_or_buffer=r'data/full_text/full_text_dropped.csv')

# Function to find the closest known location
def find_closest_location(lat, lon, locations=None):
    """Return the (State, City) of the known location nearest to a point.

    Nearness is measured along the Earth's surface (great-circle distance)
    rather than as a straight-line distance in raw degrees: one degree of
    longitude covers less ground the further a point is from the equator,
    so comparing raw degree differences can select the wrong city.

    Parameters
    ----------
    lat, lon : float
        Latitude and longitude of the query point, in decimal degrees.
    locations : pandas.DataFrame, optional
        Table of known locations with 'Latitude', 'Longitude', 'State' and
        'City' columns. Defaults to the notebook-level ``df_usa``.

    Returns
    -------
    tuple
        ``(state, city)`` of the nearest row in ``locations``, or
        ``(None, None)`` when ``lat`` or ``lon`` is missing.
    """
    if locations is None:
        locations = df_usa

    # A missing coordinate cannot be matched to anything; report "unknown"
    # instead of failing inside idxmin / .loc.
    if pd.isna(lat) or pd.isna(lon):
        return None, None

    lat_rad = np.radians(lat)
    known_lat = np.radians(locations['Latitude'])
    known_lon = np.radians(locations['Longitude'])

    half_dlat = (known_lat - lat_rad) / 2
    half_dlon = (known_lon - np.radians(lon)) / 2

    # Haversine term. The great-circle distance grows monotonically with it,
    # so its minimum identifies the nearest location without needing the
    # final arcsin / Earth-radius scaling.
    haversine = (
        np.sin(half_dlat) ** 2
        + np.cos(lat_rad) * np.cos(known_lat) * np.sin(half_dlon) ** 2
    )

    closest_idx = haversine.idxmin()
    return locations.loc[closest_idx, 'State'], locations.loc[closest_idx, 'City']

# Look up the nearest known state/city for every tweet, row by row, and
# store the two results as new columns
df_dropped[['Closest_State', 'Closest_City']] = df_dropped.apply(
    lambda tweet: find_closest_location(tweet['Latitude'], tweet['Longitude']),
    axis=1,
    result_type='expand',
)

# Order the tweets by matched state, then by matched city
df_sorted = df_dropped.sort_values(by=['Closest_State', 'Closest_City'])

# Persist the ordered table (row index is not written)
df_sorted.to_csv(r'data/full_text/df_full_sorted.csv', index=False)

print(df_sorted.head())


               UserID            Timestamp   Latitude  Longitude  \
358472  USER_28e6d0a1  2010-03-04T02:50:29  33.580862 -86.956455   
358480  USER_28e6d0a1  2010-03-04T04:33:59  33.580862 -86.956455   
358481  USER_28e6d0a1  2010-03-04T04:45:31  33.580862 -86.956455   
358484  USER_28e6d0a1  2010-03-05T00:44:08  33.580862 -86.956455   
358485  USER_28e6d0a1  2010-03-05T00:45:50  33.580862 -86.956455   

                                                TweetText Closest_State  \
358472    @USER_148a266e I gotta get you re-added to BBM!       Alabama   
358480            @USER_5c07acb0 @USER_9334f9b7 ahhh yes!       Alabama   
358481                 @USER_2594d45f An old locksmith...       Alabama   
358484  RT @USER_f1966b04: They are shooting at pentag...       Alabama   
358485  @USER_80024f73 as a matter of fact... I wanna ...       Alabama   

       Closest_City  
358472   Adamsville  
358480   Adamsville  
358481   Adamsville  
358484   Adamsville  
358485   Adamsville  
