In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file beneath the Kaggle input directory and print
# its full path, one per line, in the order os.walk yields them.
input_files = (
    os.path.join(folder, fname)
    for folder, _, files in os.walk('/kaggle/input')
    for fname in files
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Spaceship Titanic pipeline: load the data, impute missing values, encode the
# categorical columns, train an XGBoost classifier, report validation accuracy
# and write a submission file.

# Step 1: Load the data
train = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')
test = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')

# Step 2: Drop the 'Name' column (free text, not used as a feature)
train = train.drop(columns=['Name'])
test = test.drop(columns=['Name'])

# Step 3: Handle missing values
# Column groups are derived from the training frame (after dropping 'Name');
# the same groups are then applied to the test frame.
numeric_cols = train.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = train.select_dtypes(include=['object']).columns

# Numeric columns: fill with the training-set mean (fit on train only).
numeric_imputer = SimpleImputer(strategy='mean')
train[numeric_cols] = numeric_imputer.fit_transform(train[numeric_cols])
test[numeric_cols] = numeric_imputer.transform(test[numeric_cols])

# Categorical columns: fill with the most frequent training value.
categorical_imputer = SimpleImputer(strategy='most_frequent')
train[categorical_cols] = categorical_imputer.fit_transform(train[categorical_cols])
test[categorical_cols] = categorical_imputer.transform(test[categorical_cols])

# Step 4: Encode categorical columns as integer codes
cat_features = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']

for col in cat_features:
    # Compare like with like: both frames are converted to strings *before*
    # the encoder sees them (boolean-valued columns would otherwise never match
    # their stringified counterparts).
    train_values = train[col].astype(str)
    test_values = test[col].astype(str)

    # A fresh encoder per column, fitted on the union of train and test values,
    # so every category present in the test set has a code. This replaces
    # hand-editing `classes_` on a fitted encoder, which relies on
    # scikit-learn internals.
    le = LabelEncoder()
    le.fit(pd.concat([train_values, test_values], ignore_index=True))

    train[col] = le.transform(train_values)
    test[col] = le.transform(test_values)

# Step 5: Split into features and target
X = train.drop(columns=['Transported', 'PassengerId'])
# Explicit 0/1 labels, so the classifier's label type does not depend on how
# the installed XGBoost version treats boolean targets.
y = train['Transported'].astype(int)

# Step 6: Train-test split (hold out 20% as a validation set)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 7: Initialize XGBoost Model
model = XGBClassifier(
    n_estimators=1000,  # Number of trees
    learning_rate=0.05,  # Learning rate
    max_depth=6,  # Depth of each tree
    subsample=0.8,  # Proportion of samples used per tree
    colsample_bytree=0.8,  # Proportion of features used per tree
    random_state=42,
    # NOTE(review): the features above are integer codes, not pandas
    # `category` columns, so this flag presumably has no effect here - confirm.
    enable_categorical=True
)

# Step 8: Train the model
model.fit(X_train, y_train)

# Step 9: Model Evaluation
y_pred = model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy:.4f}')

# Step 10: Predictions on the test set
# Select by X.columns so the test features are in exactly the training order.
X_test = test[X.columns]
# Convert 0/1 predictions back to booleans, matching the dtype of the
# 'Transported' column in train.csv.
y_test_pred = model.predict(X_test).astype(bool)

# Step 11: Prepare the submission file
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Transported': y_test_pred
})

# Step 12: Save the submission to CSV
submission.to_csv('submission.csv', index=False)


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score

# Linear regression of sale price on living area, bedroom count and bathrooms.

# Load the dataset
# NOTE(review): the column names below assume the Kaggle "House Prices"
# train.csv schema (GrLivArea, BedroomAbvGr, FullBath, HalfBath, SalePrice) -
# confirm against the actual file.
df = pd.read_csv("train.csv")

# Keep HalfBath here: it is needed to build TotalBath on the next line.
# .copy() makes the column assignment below operate on an independent frame.
df = df[['GrLivArea','BedroomAbvGr','FullBath','HalfBath','SalePrice']].copy()
df['TotalBath'] = df['FullBath'] + (0.5 * df['HalfBath'])
df = df[['GrLivArea','BedroomAbvGr','TotalBath','SalePrice']].dropna()

# Define Features and Target
# The target (SalePrice) must not appear among the features, otherwise the
# model simply reads the answer off its input.
x = df[['GrLivArea','BedroomAbvGr','TotalBath']]
y = df['SalePrice']

# Split the Data (80% train / 20% test, fixed seed for reproducibility)
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

# Train the Linear Regression Model
model = LinearRegression()
model.fit(x_train,y_train)

# Make Predictions
y_pred = model.predict(x_test)

# Evaluate the model
mae = mean_absolute_error(y_test,y_pred)
mse = mean_squared_error(y_test,y_pred)
rmse = np.sqrt(mse)  # RMSE is the square root of the MSE
r2 = r2_score(y_test,y_pred)

print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R^2 score: {r2}")

# Visualizing the predictions
plt.scatter(y_test,y_pred,alpha=0.5)
plt.xlabel("Actual Prices")
plt.ylabel("predicted prices")
plt.title("Actual vs Predicted House Prices")
plt.show()

