# Data Cleaning and Feature Engineering -ET_project




### Libraries

In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

In [21]:
# Load the dataset
df = pd.read_csv('/Users/miguelaraujo/code/MrAraujo99/ET_Predictor/raw_data/UFO sightings/scrubbed.csv', low_memory=False)
df.columns = df.columns.str.strip()

In [22]:
# Convert 'latitude' and 'longitude' to numeric types, coerce errors
df['latitude'] = pd.to_numeric(df['latitude'], errors='coerce')
df['longitude'] = pd.to_numeric(df['longitude'], errors='coerce')

In [23]:
# Convert 'datetime' to datetime object, coerce errors
df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')

In [24]:
# Extract day, hour, and week from the datetime column
df['day'] = df['datetime'].dt.day
df['hour'] = df['datetime'].dt.hour
df['week'] = df['datetime'].dt.isocalendar().week

In [25]:
# Function to determine season
def get_season(month):
    if month in [12, 1, 2]:
        return 'Winter'
    elif month in [3, 4, 5]:
        return 'Spring'
    elif month in [6, 7, 8]:
        return 'Summer'
    elif month in [9, 10, 11]:
        return 'Fall'

df['month'] = df['datetime'].dt.month
df['season'] = df['month'].apply(get_season)

In [26]:
# Drop rows where datetime could not be converted
df.dropna(subset=['datetime'], inplace=True)

In [27]:
# Drop original datetime and unused columns
df = df.drop(columns=['datetime', 'month', 'duration (seconds)', 'duration (hours/min)', 'comments', 'date posted'])

In [28]:
df[['latitude','longitude']]

Unnamed: 0,latitude,longitude
0,29.883056,-97.941111
1,29.384210,-98.581082
2,53.200000,-2.916667
3,28.978333,-96.645833
4,21.418056,-157.803611
...,...,...
80327,36.165833,-86.784444
80328,43.613611,-116.202500
80329,38.297222,-122.284444
80330,38.901111,-77.265556


In [29]:
# Check for remaining NaNs
df.dtypes

city          object
state         object
country       object
shape         object
latitude     float64
longitude    float64
day          float64
hour         float64
week          UInt32
season        object
dtype: object

In [30]:
# Define the features (X) and target (y)
X = df.drop(['latitude', 'longitude'], axis=1)
y = df[['latitude', 'longitude']]

In [49]:
# Remove rows where the target variable has NaN values
mask = ~y.isna().any(axis=1)
X = X[mask]
y = y[mask]

In [50]:
#y.loc[pd.isna(y["latitude"]), :].index

In [32]:
df.columns

Index(['city', 'state', 'country', 'shape', 'latitude', 'longitude', 'day',
       'hour', 'week', 'season'],
      dtype='object')

In [33]:
# Identify numeric and categorical columns
numeric_features = ['day', 'hour', 'week']
categorical_features = ['city', 'state', 'country', 'shape', 'season']

In [34]:
# Ensure all required columns are present
missing_columns = set(numeric_features + categorical_features) - set(X.columns)
if missing_columns:
    print(f"Missing columns in the DataFrame: {missing_columns}")
else:
    print("All columns are present in the DataFrame.")

All columns are present in the DataFrame.


In [35]:
# Define the preprocessing steps
numeric_transformer = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler()
)

categorical_transformer = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore')
)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

In [36]:
# Define the pipeline with the preprocessor and Gradient Boosting Regressor
pipeline = make_pipeline(
    preprocessor,
    GradientBoostingRegressor(n_estimators=100, random_state=42)
)

In [37]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


### Fitting on the training data and predicting

In [38]:
df[df.columns[df.isnull().any()]].isnull().sum()

state       5686
country     9533
shape       1891
latitude       1
dtype: int64

In [39]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((63710, 8), (15928, 8), (63710, 2), (15928, 2))

In [40]:
y.isna().sum()

latitude     1
longitude    0
dtype: int64

In [46]:
# Remove rows where y_train has NaN values
mask = ~y_train.isna().any(axis=1)
X_train_cleaned = X_train[mask]
y_train_cleaned = y_train[mask]

# Confirm that NaN values are removed
print(y_train_cleaned.isna().sum())

latitude     0
longitude    0
dtype: int64


In [48]:
# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Predict on the test data
y_pred = pipeline.predict(X_test)

# Calculate MSE
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

ValueError: Input y contains NaN.