In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import warnings
warnings.filterwarnings('ignore')  # Ignore warnings for clean output

# Read the Titanic passenger table straight from its hosted CSV into a DataFrame.
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
titanic_data = pd.read_csv(url)

# --- Quick inspection of the raw table ---
# Each print emits the heading and the table on separate lines via sep="\n".
print("🔹 First 5 rows of the dataset:", titanic_data.head(), sep="\n")

# Per-column count of missing entries
print("\n🔹 Missing values in each column:", titanic_data.isna().sum(), sep="\n")

# Keep only rows that have a 'Survived' label (usually a no-op for this dataset).
titanic_data = titanic_data.dropna(subset=['Survived'])

# Select input features (X) and target/output variable (y).
#
# Feature columns:
#   Pclass - passenger class (1st, 2nd, 3rd)
#   Sex    - gender (string; encoded numerically below)
#   Age    - passenger age (may contain missing values; imputed below)
#   SibSp  - number of siblings/spouses aboard
#   Parch  - number of parents/children aboard
#   Fare   - ticket fare
#
# `assign` returns a brand-new DataFrame, so the preprocessing never writes into
# a slice of `titanic_data`. The previous `.loc[...] = ...` assignment and
# `.fillna(..., inplace=True)` on an intermediate Series are chained-assignment
# patterns: depending on the pandas version (notably with Copy-on-Write) the
# in-place fill may never reach X, leaving NaNs in 'Age'.
X = titanic_data[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']].assign(
    # Convert 'Sex' from string to numerical: female -> 0, male -> 1
    Sex=lambda features: features['Sex'].map({'female': 0, 'male': 1}),
    # Fill missing 'Age' values with the median of the observed ages
    Age=lambda features: features['Age'].fillna(features['Age'].median()),
)
y = titanic_data['Survived']  # Target variable (0 = Died, 1 = Survived)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build a 100-tree Random Forest and fit it on the training portion in one step
# (`fit` returns the fitted estimator itself).
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predicted labels for the held-out passengers
y_pred = rf_classifier.predict(X_test)

# Score the predictions: overall accuracy plus the detailed classification report
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Report both metrics
print(f"\nAccuracy: {accuracy:.2f}")
print("\nClassification Report:\n", classification_rep)

# Use the first held-out passenger as a worked example (kept as a one-row DataFrame).
sample = X_test.head(1)
prediction = rf_classifier.predict(sample)

# Feature name -> value mapping, for readable printing
sample_dict = sample.iloc[0].to_dict()

# Translate the predicted class label into a human-readable outcome
if prediction[0] == 1:
    outcome = 'Survived'
else:
    outcome = 'Did Not Survive'

# Show the passenger's features alongside the model's verdict
print(f"\nSample Passenger: {sample_dict}")
print(f"Predicted Survival: {outcome}")


🔹 First 5 rows of the dataset:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            3734

In [9]:
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Fetch the California housing dataset object (exposes .data, .target, .feature_names)
california_housing = fetch_california_housing()

# Assemble a single DataFrame: the named feature columns, with the target
# (median house value) appended as a trailing 'MEDV' column.
california_data = pd.DataFrame(
    california_housing.data,
    columns=california_housing.feature_names,
).assign(MEDV=california_housing.target)

# ----- Sample data preview -----
print("🔹 First 5 rows of the dataset:", california_data.head(), sep="\n")

# The target is the 'MEDV' column; the features are every other column.
y = california_data['MEDV']
X = california_data.drop(columns='MEDV')

# Reserve 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a 100-tree Random Forest regressor and fit it on the training rows
# (`fit` returns the fitted estimator itself).
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predicted house values for the held-out rows
y_pred = rf_regressor.predict(X_test)

# Evaluation metrics on the test set: Mean Squared Error and R² score
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Predict the house price for a single test sample.
# The list selector `iloc[[0]]` keeps the row as a one-row DataFrame, so the
# input still carries the column names the model was fitted with (a bare
# `.values.reshape(1, -1)` array drops them).
single_data = X_test.iloc[[0]]
predicted_value = rf_regressor.predict(single_data)

# Print prediction results for the single data point.
# "\n" (not "\P", an invalid escape that printed a literal backslash) starts
# the section with a blank line, matching the metrics section below.
print(f"\nPredicted Value: {predicted_value[0]:.2f}")
print(f"Actual Value: {y_test.iloc[0]:.2f}")

# Print model evaluation metrics
print(f"\nMean Squared Error: {mse:.2f}")
print(f"R-squared Score: {r2:.2f}")


🔹 First 5 rows of the dataset:
   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude   MEDV  
0    -122.23  4.526  
1    -122.22  3.585  
2    -122.24  3.521  
3    -122.25  3.413  
4    -122.25  3.422  
\Predicted Value: 0.51
Actual Value: 0.48

Mean Squared Error: 0.26
R-squared Score: 0.81
