In [2]:
import pandas as pd

# Load the data
df = pd.read_csv('flightdata.csv')

# Display basic information about the DataFrame
print("Original DataFrame Info:")
df.info()

Original DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123984 entries, 0 to 123983
Data columns (total 31 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Unnamed: 0      123984 non-null  int64  
 1   year            123984 non-null  int64  
 2   month           123984 non-null  int64  
 3   day             123984 non-null  int64  
 4   dep_time        121788 non-null  float64
 5   sched_dep_time  123984 non-null  int64  
 6   dep_delay       121788 non-null  float64
 7   arr_time        121651 non-null  float64
 8   sched_arr_time  123984 non-null  int64  
 9   arr_delay       121427 non-null  float64
 10  carrier         123984 non-null  object 
 11  flight          123984 non-null  int64  
 12  tailnum         123003 non-null  object 
 13  origin          123984 non-null  object 
 14  dest            123984 non-null  object 
 15  air_time        121427 non-null  float64
 16  distance        123984 non-null

In [3]:
# Display the shape of the original DataFrame
print("Original DataFrame Shape:", df.shape)

Original DataFrame Shape: (123984, 31)


In [4]:
# Display whether there are any missing values in the original DataFrame
print("Original DataFrame Has Missing Values:", df.isnull().values.any())

Original DataFrame Has Missing Values: True


In [5]:
# Display whether there are any missing values in the original DataFrame
print("Original DataFrame Has Missing Values:", df.isnull().values.any())

# Display the count of missing values for each column in the original DataFrame
print("Original DataFrame Missing Values per Column:")
print(df.isnull().sum())


Original DataFrame Has Missing Values: True
Original DataFrame Missing Values per Column:
Unnamed: 0           0
year                 0
month                0
day                  0
dep_time          2196
sched_dep_time       0
dep_delay         2196
arr_time          2333
sched_arr_time       0
arr_delay         2557
carrier              0
flight               0
tailnum            981
origin               0
dest                 0
air_time          2557
distance             0
hour                 0
minute               0
time_hour            0
flightb              0
totflight            0
avgdelay            12
airline              0
airport              0
lat                  0
lon                  0
alt                  0
tz                   0
dst                  0
tzone                0
dtype: int64


In [6]:
from sklearn.impute import SimpleImputer

# Columns to impute
columns_to_impute = ['dep_time', 'dep_delay', 'arr_time', 'arr_delay', 'tailnum', 'air_time', 'avgdelay']

# Replace 'mean' with 'most_frequent' for non-numeric data
imputer = SimpleImputer(strategy='most_frequent')

# Fit and transform the imputer on the specified columns
df[columns_to_impute] = imputer.fit_transform(df[columns_to_impute])

In [7]:
# Display the shape and missing values of the DataFrame after dropping columns
print("\nDataFrame Info After Column imputation:")
print(df.info())


DataFrame Info After Column imputation:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123984 entries, 0 to 123983
Data columns (total 31 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Unnamed: 0      123984 non-null  int64  
 1   year            123984 non-null  int64  
 2   month           123984 non-null  int64  
 3   day             123984 non-null  int64  
 4   dep_time        123984 non-null  object 
 5   sched_dep_time  123984 non-null  int64  
 6   dep_delay       123984 non-null  object 
 7   arr_time        123984 non-null  object 
 8   sched_arr_time  123984 non-null  int64  
 9   arr_delay       123984 non-null  object 
 10  carrier         123984 non-null  object 
 11  flight          123984 non-null  int64  
 12  tailnum         123984 non-null  object 
 13  origin          123984 non-null  object 
 14  dest            123984 non-null  object 
 15  air_time        123984 non-null  object 
 16  distance       

In [8]:
print("\nmissing values of the DataFrame after dropping columns:", df.isnull().sum())


missing values of the DataFrame after dropping columns: Unnamed: 0        0
year              0
month             0
day               0
dep_time          0
sched_dep_time    0
dep_delay         0
arr_time          0
sched_arr_time    0
arr_delay         0
carrier           0
flight            0
tailnum           0
origin            0
dest              0
air_time          0
distance          0
hour              0
minute            0
time_hour         0
flightb           0
totflight         0
avgdelay          0
airline           0
airport           0
lat               0
lon               0
alt               0
tz                0
dst               0
tzone             0
dtype: int64


In [9]:
print("\nDataFrame Shape After Column imputation:", df.shape)


DataFrame Shape After Column imputation: (123984, 31)


In [10]:
df.head()

Unnamed: 0.1,Unnamed: 0,year,month,day,dep_time,sched_dep_time,dep_delay,arr_time,sched_arr_time,arr_delay,...,totflight,avgdelay,airline,airport,lat,lon,alt,tz,dst,tzone
0,1,2013,1,1,542.0,540,2.0,923.0,850,33.0,...,91,2.101124,American Airlines Inc.,Miami Intl,25.79325,-80.290556,8,-5,A,America/New_York
1,2,2013,1,1,554.0,600,-6.0,812.0,837,-25.0,...,268,-0.708812,Delta Air Lines Inc.,Hartsfield Jackson Atlanta Intl,33.636719,-84.428067,1026,-5,A,America/New_York
2,3,2013,1,1,554.0,558,-4.0,740.0,728,12.0,...,1,-4.0,United Air Lines Inc.,Chicago Ohare Intl,41.978603,-87.904842,668,-6,A,America/Chicago
3,4,2013,1,1,555.0,600,-5.0,913.0,854,19.0,...,168,0.789157,JetBlue Airways,Fort Lauderdale Hollywood Intl,26.072583,-80.15275,9,-5,A,America/New_York
4,5,2013,1,1,557.0,600,-3.0,838.0,846,-8.0,...,162,-0.440994,JetBlue Airways,Orlando Intl,28.429394,-81.308994,96,-5,A,America/New_York


In [11]:
from sklearn.model_selection import train_test_split

# Specify the features (X) and the target variable (y)
X = df.drop(['arr_time'], axis=1)  # Adjust with your target variable
y = df['arr_time']  # Replace 'arrival_time' with your target variable

# Split the data into training and testing sets (e.g., 80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the resulting sets
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


X_train shape: (99187, 30)
X_test shape: (24797, 30)
y_train shape: (99187,)
y_test shape: (24797,)


In [12]:
from sklearn.model_selection import cross_val_score, cross_val_predict, StratifiedKFold
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [13]:
print("Unique values in y_train:", y_train.unique())


Unique values in y_train: [33.0 2253.0 2224.0 ... 315.0 401.0 623.0]


In [14]:
# @title
print(X_train.dtypes)
print(X_train.isnull().sum())

print(y_train.dtypes)
print(y_train.isnull().sum())



Unnamed: 0          int64
year                int64
month               int64
day                 int64
dep_time           object
sched_dep_time      int64
dep_delay          object
sched_arr_time      int64
arr_delay          object
carrier            object
flight              int64
tailnum            object
origin             object
dest               object
air_time           object
distance            int64
hour                int64
minute              int64
time_hour          object
flightb            object
totflight           int64
avgdelay           object
airline            object
airport            object
lat               float64
lon               float64
alt                 int64
tz                  int64
dst                object
tzone              object
dtype: object
Unnamed: 0        0
year              0
month             0
day               0
dep_time          0
sched_dep_time    0
dep_delay         0
sched_arr_time    0
arr_delay         0
carrier           0
flight

In [15]:
# Identify unique values in columns with 'object' data type in X_train
object_columns_X = X_train.select_dtypes(include=['object']).columns
for column in object_columns_X:
    print(f"Unique values in {column}: {X_train[column].unique()}")

# Check the unique values in y_train
print("Unique values in y_train:", y_train.unique())


Unique values in dep_time: [2333.0 1924.0 2123.0 ... 105.0 8.0 149.0]
Unique values in dep_delay: [38.0 -1.0 43.0 -2.0 -5.0 11.0 35.0 1.0 2.0 0.0 -11.0 -6.0 52.0 26.0 -3.0
 6.0 -4.0 -7.0 7.0 4.0 10.0 17.0 9.0 -8.0 18.0 -10.0 99.0 32.0 13.0 96.0
 68.0 15.0 164.0 5.0 31.0 24.0 3.0 70.0 14.0 69.0 54.0 176.0 -9.0 45.0
 165.0 114.0 406.0 33.0 39.0 57.0 65.0 46.0 12.0 16.0 -13.0 8.0 19.0 28.0
 37.0 42.0 84.0 87.0 249.0 111.0 62.0 29.0 -12.0 22.0 40.0 105.0 51.0 59.0
 207.0 -14.0 83.0 193.0 128.0 34.0 30.0 -15.0 27.0 112.0 64.0 93.0 90.0
 121.0 129.0 108.0 71.0 20.0 92.0 23.0 94.0 123.0 79.0 81.0 101.0 89.0
 216.0 86.0 73.0 248.0 117.0 163.0 134.0 47.0 225.0 48.0 56.0 55.0 95.0
 97.0 36.0 74.0 267.0 21.0 41.0 175.0 167.0 209.0 149.0 75.0 158.0 58.0
 53.0 153.0 44.0 159.0 67.0 190.0 194.0 25.0 124.0 125.0 116.0 115.0 66.0
 50.0 78.0 169.0 181.0 139.0 136.0 77.0 251.0 184.0 60.0 88.0 106.0 150.0
 201.0 63.0 118.0 133.0 381.0 131.0 49.0 137.0 72.0 110.0 287.0 -17.0
 143.0 98.0 301.0 80.0 354.0 8

In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline

# Check for and handle missing values
X_train = X_train.dropna()
y_train = y_train[X_train.index]  # Adjust y_train accordingly

# Convert 'object' columns to numerical values
X_train['dep_time'] = pd.to_numeric(X_train['dep_time'], errors='coerce')
X_train['dep_delay'] = pd.to_numeric(X_train['dep_delay'], errors='coerce')
X_train['arr_delay'] = pd.to_numeric(X_train['arr_delay'], errors='coerce')
X_train['air_time'] = pd.to_numeric(X_train['air_time'], errors='coerce')

# Drop remaining missing values
X_train = X_train.dropna()

# Ensure all features are numeric
X_train = pd.get_dummies(X_train)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Random Forest with pipeline
random_forest_model = make_pipeline(StandardScaler(), RandomForestRegressor())
random_forest_scores = cross_val_score(random_forest_model, X_train_scaled, y_train, cv=5, scoring='r2')

print("Random Forest Cross-Validation Scores:", random_forest_scores)
print("Random Forest Mean R^2:", random_forest_scores.mean())


NameError: name 'X_train' is not defined