In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


train_data = pd.read_csv('titanic_train.csv')
test_data = pd.read_csv('titanic_test.csv')


def preprocess_data(data):
    data = data[['Survived', 'Pclass', 'Sex', 'Age', 'Fare']]
    data['Sex'] = data['Sex'].map({'male': 0, 'female': 1})  
    data = data.dropna()
    X = data.drop('Survived', axis=1)
    y = data['Survived']
    return X, y

def preprocess_test_data(data):
    data = data[['Pclass', 'Sex', 'Age', 'Fare']]
    data['Sex'] = data['Sex'].map({'male': 0, 'female': 1})  
    data = data.dropna()
    return data

X_train, y_train = preprocess_data(train_data)
X_test = preprocess_test_data(test_data)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

class LinearRegression:
    def __init__(self):
        self.weights = None

    def fit(self, X, y):
        X = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1)
        self.weights = np.linalg.inv(X.T @ X) @ X.T @ y

    def predict(self, X):
       
        X = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1)
        return X @ self.weights 

    def MAE(self, y_true, y_pred):
        return np.mean(np.abs(y_true - y_pred))

    def MAPE(self, y_true, y_pred):
        return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

    def r2_score(self, y_true, y_pred):
        ss_total = np.sum((y_true - np.mean(y_true))**2)
        ss_res = np.sum((y_true - y_pred)**2)
        return 1 - (ss_res / ss_total)


model = LinearRegression()
model.fit(X_train, y_train)


y_pred_train = model.predict(X_train)
print(f'Training MAE: {model.MAE(y_train, y_pred_train)}')
print(f'Training MAPE: {model.MAPE(y_train, y_pred_train)}')
print(f'Training R^2: {model.r2_score(y_train, y_pred_train)}')


print(f'Weights: {model.weights}')


Training MAE: 0.2980211802129321
Training MAPE: inf
Training R^2: 0.39020129996439834
Weights: [ 0.40616246 -0.16788205  0.23051673 -0.07875988  0.00359625]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Sex'] = data['Sex'].map({'male': 0, 'female': 1})  # Convert 'Sex' from categorical to numerical
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Sex'] = data['Sex'].map({'male': 0, 'female': 1})  # Convert 'Sex' from categorical to numerical


In [6]:
train_data = pd.read_csv('titanic_train.csv')
print(train_data.head())

   PassengerId  Pclass                                               Name  \
0            1       3                            Braund, Mr. Owen Harris   
1            2       1  Cumings, Mrs. John Bradley (Florence Briggs Th...   
2            3       3                             Heikkinen, Miss. Laina   
3            4       1       Futrelle, Mrs. Jacques Heath (Lily May Peel)   
4            5       3                           Allen, Mr. William Henry   

      Sex   Age  SibSp  Parch            Ticket     Fare Cabin Embarked  \
0    male  22.0      1      0         A/5 21171   7.2500   NaN        S   
1  female  38.0      1      0          PC 17599  71.2833   C85        C   
2  female  26.0      0      0  STON/O2. 3101282   7.9250   NaN        S   
3  female  35.0      1      0            113803  53.1000  C123        S   
4    male  35.0      0      0            373450   8.0500   NaN        S   

   Survived  
0         0  
1         1  
2         1  
3         1  
4         0  
