## M32895 Portfolio Part 2 – UP2089158

## Exercise 1

In [None]:
# Exercise 1: Create and analyze a random DataFrame
import pandas as pd
import numpy as np

# Seed a dedicated generator so Restart & Run All reproduces the same table
# and summary, without touching NumPy's global random state.
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)

# Generate a 6x4 DataFrame of uniform random numbers in [0, 1)
row_labels = [f"R{i}" for i in range(1, 7)]
df = pd.DataFrame(rng.random((6, 4)), columns=['A', 'B', 'C', 'D'], index=row_labels)

print("DataFrame:")
print(df)

# Summary of the DataFrame: dimensions, then per-column descriptive statistics
print("\nShape of DataFrame:", df.shape)
print("\nData Summary:")
print(df.describe())


## Exercise 2

In [None]:
# Exercise 2: Boxplots of Iris dataset using matplotlib
import seaborn as sns
import matplotlib.pyplot as plt

# Load the Iris example dataset through seaborn
iris = sns.load_dataset("iris")

# Drop the species column so only the columns to be box-plotted remain
iris_measurements = iris.drop(columns=['species'])

# One boxplot per remaining column, laid out on a 2x2 grid
boxplot_options = {
    'subplots': True,
    'layout': (2, 2),
    'figsize': (10, 6),
    'title': 'Iris Boxplots',
}
iris_measurements.plot(kind='box', **boxplot_options)
plt.tight_layout()
plt.show()

## Exercise 3

In [None]:
# Exercise 3: Sunburst chart of tips data using plotly.express
import plotly.express as px
import seaborn as sns

# Load the restaurant tips example dataset through seaborn
tips = sns.load_dataset("tips")

# Hierarchy levels of the chart, listed from root to leaves
ring_levels = ['sex', 'day', 'time']

# Sunburst chart with sectors sized by the 'tip' column
fig = px.sunburst(
    tips,
    path=ring_levels,
    values='tip',
    title='Sunburst of Tips by Gender, Day, and Time',
)
fig.show()


## Exercise 4

In [None]:
# Exercise 4: Linear regression on restaurant tips
from sklearn.linear_model import LinearRegression
import numpy as np

# Double brackets select a one-column DataFrame, giving the 2-D feature
# matrix that sklearn estimators expect; the target stays 1-D.
X = tips[['total_bill']].to_numpy()
y = tips['tip'].to_numpy()

# Fit a linear model of tip as a function of total bill
# (fit() returns the estimator itself, so it can be chained)
model = LinearRegression().fit(X, y)

# Report the fitted slope and intercept
print(f"Coefficient: {model.coef_[0]}")
print(f"Intercept: {model.intercept_}")

# Predict the tip for a single $30 bill (one row, one feature)
bill_example = np.array([[30]])
predicted_tip = model.predict(bill_example)
print(f"Predicted tip for a $30 bill: ${predicted_tip[0]:.2f}")

## Exercise 5

In [None]:
# Exercise 5: Titanic Decision Tree with categorical features
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Load Titanic dataset
titanic = sns.load_dataset('titanic')

# Drop rows with missing values in required columns
titanic_clean = titanic.dropna(subset=['sex', 'embarked', 'age', 'fare'])

# Define features and target
features = ['sex', 'sibsp', 'parch', 'embarked', 'age', 'fare']
target = 'survived'
X = titanic_clean[features]
y = titanic_clean[target]

# Define categorical and numerical columns
categorical_features = ['sex', 'embarked']
numeric_features = ['sibsp', 'parch', 'age', 'fare']

# One-hot encode the categorical columns and pass the numeric columns through.
# Both groups are listed explicitly (instead of remainder='passthrough') so the
# output column order is guaranteed to be: encoded categoricals, then
# numeric_features in the order given above. The feature-importance labels
# built further down rely on exactly this order.
# handle_unknown='ignore' encodes a category unseen during fit as all zeros
# instead of raising when predicting.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', 'passthrough', numeric_features),
    ]
)

# Create pipeline with Decision Tree (fixed random_state for reproducible trees)
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=42))
])

# Hold out 20% of the rows for evaluation (fixed random_state for a reproducible split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipeline.fit(X_train, y_train)

# Evaluate on the held-out rows
y_pred = pipeline.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Predict outcome for a sample passenger
sample_passenger = pd.DataFrame([{
    'sex': 'male',
    'sibsp': 1,
    'parch': 0,
    'embarked': 'S',
    'age': 30,
    'fare': 75.0
}])
sample_prediction = pipeline.predict(sample_passenger)
print(f"Prediction for sample passenger: {'Survived' if sample_prediction[0] == 1 else 'Did not survive'}")

# Label each importance with the name of the transformed column it belongs to.
# The label order mirrors the transformer order declared in `preprocessor`.
importances = pipeline.named_steps['classifier'].feature_importances_
encoded_cat_names = pipeline.named_steps['preprocessor'].named_transformers_['cat'].get_feature_names_out(categorical_features)
feature_names = list(encoded_cat_names) + numeric_features

# Guard against labels drifting out of sync with the transformed columns
assert len(feature_names) == len(importances), (
    f"{len(feature_names)} feature names for {len(importances)} importances"
)

plt.figure(figsize=(10, 6))
plt.barh(feature_names, importances)
plt.xlabel('Importance')
plt.title('Feature Importances – Titanic Decision Tree Model')
plt.tight_layout()
plt.show()
