In [1]:
import pandas as pd

# Load data from Excel
df = pd.read_csv('train_booking_data.csv')
print(df.head())


   Booking ID  User Age Preferred Age Group  Seat Number  Row  Column  \
0           1        23               18-25           12    2       6   
1           2        34               25-40           15    3       3   
2           3        45               40-60           20    4       2   
3           4        62                 60+            9    2       3   
4           5        29               25-40           18    3       6   

  Seat Type  Seat Booked  
0    Window         True  
1     Aisle         True  
2    Middle         True  
3     Aisle         True  
4    Window         True  


In [4]:
# Check for missing values
print(df.isnull().sum())

# Fill missing values or drop rows/columns
df.fillna({'User Age': df['User Age'].mean(), 'Seat Type': 'Unknown'}, inplace=True)
# or
df.dropna(inplace=True)


Booking ID             0
User Age               0
Preferred Age Group    0
Seat Number            0
Row                    0
Column                 0
Seat Type              0
Seat Booked            0
dtype: int64


In [5]:
# One-hot encoding for categorical variables
df_encoded = pd.get_dummies(df[['User Age', 'Preferred Age Group', 'Seat Type']], drop_first=True)

In [6]:
# Example: Convert 'Age Group' to numerical ranges
age_group_mapping = {'18-25': 1, '25-40': 2, '40-60': 3, '60+': 4}
df['Age Group Numeric'] = df['Preferred Age Group'].map(age_group_mapping)


In [10]:
print(df.head())

   Booking ID  User Age Preferred Age Group  Seat Number  Row  Column  \
0           1        23               18-25           12    2       6   
1           2        34               25-40           15    3       3   
2           3        45               40-60           20    4       2   
3           4        62                 60+            9    2       3   
4           5        29               25-40           18    3       6   

  Seat Type  Seat Booked  Age Group Numeric  Age Scaled  
0    Window         True                  1   -1.291475  
1     Aisle         True                  2   -0.451698  
2    Middle         True                  3    0.388079  
3     Aisle         True                  4    1.685916  
4    Window         True                  2   -0.833415  


In [9]:
from sklearn.preprocessing import StandardScaler

# Example: Scaling 'Age'
scaler = StandardScaler()
df['Age Scaled'] = scaler.fit_transform(df[['User Age']])


In [11]:
from imblearn.over_sampling import SMOTE

X = df_encoded.drop('Seat Booked', axis=1)
y = df_encoded['Seat Booked']

# Apply SMOTE to balance the dataset
smote = SMOTE()
X_res, y_res = smote.fit_resample(X, y)

ImportError: cannot import name '_MissingValues' from 'sklearn.utils._param_validation' (C:\Users\Diya\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py)

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)


NameError: name 'X_res' is not defined

In [13]:
import numpy as np

# Define parameters for synthetic data
n_synthetic = 100  # Number of synthetic records
age_groups = ['18-25', '25-40', '40-60', '60+']
seat_types = ['Window', 'Aisle', 'Middle']

# Create synthetic records
synthetic_data = {
    'Age': np.random.randint(18, 70, size=n_synthetic),
    'Age Group': np.random.choice(age_groups, size=n_synthetic),
    'Seat Number': np.random.randint(1, 100, size=n_synthetic),
    'Row': np.random.randint(1, 20, size=n_synthetic),
    'Column': np.random.randint(1, 6, size=n_synthetic),
    'Seat Type': np.random.choice(seat_types, size=n_synthetic),
    'Seat Booked': [False] * n_synthetic
}

synthetic_df = pd.DataFrame(synthetic_data)


In [14]:
# Load original data
df_booked = pd.read_csv('train_booking_data.csv')

# Append synthetic data
df_combined = pd.concat([df_booked, synthetic_df], ignore_index=True)


In [16]:
# Handle missing values if any
df_combined.fillna({'User Age': df_combined['User Age'].mean(), 'Seat Type': 'Unknown'}, inplace=True)

# Convert categorical variables
df_encoded = pd.get_dummies(df_combined[['User Age', 'Preferred Age Group', 'Seat Type']], drop_first=True)

# Create new features if needed
age_group_mapping = {'18-25': 1, '25-40': 2, '40-60': 3, '60+': 4}
df_combined['Age Group Numeric'] = df_combined['Preferred Age Group'].map(age_group_mapping)

# Scale numerical features
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df_combined['Age Scaled'] = scaler.fit_transform(df_combined[['User Age']])


In [17]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Prepare features and target variable
X = df_encoded
y = df_combined['Seat Booked']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 1.0


In [22]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load booked seat data
df_booked = pd.read_csv('train_booking_data.csv')

# Define features and target
X = df_booked[['User Age', 'Row', 'Column', 'Seat Type']]  # Features
y = df_booked['Seat Booked']  # Target

# Convert categorical features
X = pd.get_dummies(X, columns=['Seat Type'], drop_first=True)

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train model
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Generate synthetic data for all possible seat configurations
rows = list(range(1, 20))
columns = list(range(1, 6))
seat_types = ['Window', 'Aisle', 'Middle']
synthetic_data = []

for row in rows:
    for column in columns:
        for seat_type in seat_types:
            synthetic_data.append({
                'User Age': 0,  # Placeholder
                'Row': row,
                'Column': column,
                'Seat Type': seat_type
            })

df_synthetic = pd.DataFrame(synthetic_data)

# Convert categorical features
df_synthetic = pd.get_dummies(df_synthetic, columns=['Seat Type'], drop_first=True)

# Predict probabilities
df_synthetic_scaled = scaler.transform(df_synthetic)
probabilities = model.predict_proba(df_synthetic_scaled)

# Handle case where there is only one column
if probabilities.shape[1] == 1:
    df_synthetic['Booking Probability'] = probabilities
else:
    df_synthetic['Booking Probability'] = probabilities[:, 1]

# Recommend seats based on probability
recommended_seats = df_synthetic.sort_values(by='Booking Probability', ascending=False)
print(recommended_seats.head())


     User Age  Row  Column  Seat Type_Middle  Seat Type_Window  \
0           0    1       1             False              True   
188         0   13       3              True             False   
194         0   13       5              True             False   
193         0   13       5             False             False   
192         0   13       5             False              True   

     Booking Probability  
0                    1.0  
188                  1.0  
194                  1.0  
193                  1.0  
192                  1.0  


In [24]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Example of the trained model and scaler
# model = RandomForestClassifier()
# scaler = StandardScaler()

# Function to recommend seats based on user's age and preferred age group
def recommend_seats(user_age, preferred_age_group):
    # Generate synthetic data for seat recommendations
    rows = list(range(1, 20))
    columns = list(range(1, 6))
    seat_types = ['Window', 'Aisle', 'Middle']
    synthetic_data = []

    for row in rows:
        for column in columns:
            for seat_type in seat_types:
                synthetic_data.append({
                    'User Age': user_age,
                    'Row': row,
                    'Column': column,
                    'Seat Type': seat_type
                })

    df_synthetic = pd.DataFrame(synthetic_data)

    # Convert categorical features
    df_synthetic = pd.get_dummies(df_synthetic, columns=['Seat Type'], drop_first=True)

    # Scale features
    df_synthetic_scaled = scaler.transform(df_synthetic)

    # Predict probabilities
    probabilities = model.predict_proba(df_synthetic_scaled)

    # Handle case where there is only one column
    if probabilities.shape[1] == 1:
        df_synthetic['Booking Probability'] = probabilities
    else:
        df_synthetic['Booking Probability'] = probabilities[:, 1]

    # Filter recommendations based on preferred age group
    age_group_mapping = {
        '18-25': (18, 25),
        '25-40': (25, 40),
        '40-60': (40, 60),
        '60+': (60, 150)  # assuming age > 60
    }

    min_age, max_age = age_group_mapping.get(preferred_age_group, (0, 150))
    filtered_recommendations = df_synthetic[
        (df_synthetic['User Age'] >= min_age) & (df_synthetic['User Age'] <= max_age)
    ]

    # Sort by booking probability
    recommended_seats = filtered_recommendations.sort_values(by='Booking Probability', ascending=False)

    return recommended_seats

# Example usage
user_age = 29
preferred_age_group = '25-40'
recommended_seats = recommend_seats(user_age, preferred_age_group)
print(recommended_seats)


     User Age  Row  Column  Seat Type_Middle  Seat Type_Window  \
0          29    1       1             False              True   
188        29   13       3              True             False   
194        29   13       5              True             False   
193        29   13       5             False             False   
192        29   13       5             False              True   
..        ...  ...     ...               ...               ...   
101        29    7       4              True             False   
102        29    7       5             False              True   
103        29    7       5             False             False   
104        29    7       5              True             False   
284        29   19       5              True             False   

     Booking Probability  
0                    1.0  
188                  1.0  
194                  1.0  
193                  1.0  
192                  1.0  
..                   ...  
101               

In [25]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

# Example of the trained model and scaler
# model = RandomForestClassifier()
# scaler = StandardScaler()

def recommend_preferred_seats(user_age, preferred_age_group):
    # Generate synthetic data for seat recommendations
    rows = list(range(1, 20))
    columns = list(range(1, 6))
    seat_types = ['Window', 'Aisle', 'Middle']
    synthetic_data = []

    for row in rows:
        for column in columns:
            for seat_type in seat_types:
                synthetic_data.append({
                    'User Age': user_age,
                    'Row': row,
                    'Column': column,
                    'Seat Type': seat_type
                })

    df_synthetic = pd.DataFrame(synthetic_data)

    # Convert categorical features
    df_synthetic = pd.get_dummies(df_synthetic, columns=['Seat Type'], drop_first=True)

    # Scale features
    df_synthetic_scaled = scaler.transform(df_synthetic)

    # Predict probabilities
    probabilities = model.predict_proba(df_synthetic_scaled)

    # Handle case where there is only one column
    if probabilities.shape[1] == 1:
        df_synthetic['Booking Probability'] = probabilities
    else:
        df_synthetic['Booking Probability'] = probabilities[:, 1]

    # Filter recommendations based on preferred age group
    age_group_mapping = {
        '18-25': (18, 25),
        '25-40': (25, 40),
        '40-60': (40, 60),
        '60+': (60, 150)  # assuming age > 60
    }

    min_age, max_age = age_group_mapping.get(preferred_age_group, (0, 150))
    filtered_recommendations = df_synthetic[
        (df_synthetic['User Age'] >= min_age) & (df_synthetic['User Age'] <= max_age)
    ]

    # Sort by booking probability
    recommended_seats = filtered_recommendations.sort_values(by='Booking Probability', ascending=False)

    return recommended_seats[['Row', 'Column', 'Seat Type', 'Booking Probability']]

# Example usage
user_age = 29
preferred_age_group = '25-40'
preferred_seats = recommend_preferred_seats(user_age, preferred_age_group)
print(preferred_seats.head())


KeyError: "['Seat Type'] not in index"