In [None]:
import pandas as pd
import numpy as np

# Seed NumPy's legacy global generator so the random table is reproducible
np.random.seed(42)

# Table dimensions
num_rows = 6
num_cols = 4

# Draw a (num_rows x num_cols) array of random floats
data = np.random.rand(num_rows, num_cols)

# Human-readable labels: Row1..RowN and Col1..ColM
row_labels = [f'Row{i+1}' for i in range(num_rows)]
col_labels = [f'Col{j+1}' for j in range(num_cols)]

# Assemble the labelled DataFrame
df = pd.DataFrame(data, index=row_labels, columns=col_labels)

# Display the DataFrame
print("DataFrame:")
print(df)

# Summary (info)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
print("\nSummary (info):")
df.info()

# Shape as a (rows, columns) tuple
print("\nShape of DataFrame:")
print(df.shape)

# Per-column descriptive statistics
print("\nDescriptive Statistics:")
print(df.describe())

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Fetch the Iris sample dataset through seaborn
iris = sns.load_dataset('iris')

# Preview the head of the table
print("First few rows of the Iris dataset:")
print(iris.head())

# Names of the float-typed columns; these are the ones that get a boxplot
numerical_cols = iris.select_dtypes(include=['float']).columns

# One boxplot per float column, placed on a 2x2 grid of a single figure
fig = plt.figure(figsize=(12, 8))
for position, column in enumerate(numerical_cols, start=1):
    ax = fig.add_subplot(2, 2, position)
    ax.boxplot(iris[column])
    ax.set_title(f'Boxplot of {column}')
    ax.set_ylabel(column)

# Tidy the spacing between panels, then render
fig.tight_layout()
plt.show()

In [None]:
import seaborn as sns
import pandas as pd
import plotly.express as px

# Load the Tips dataset
tips = sns.load_dataset('tips')

# Total tip per (sex, day, time) combination.
# observed=True restricts the result to combinations that actually occur in
# the data. If the grouping columns are categorical, leaving `observed` unset
# makes pandas emit a FutureWarning and (on versions where the default is
# False) adds a zero-total row for every unobserved combination, which would
# then be passed to the chart as a zero-weight segment.
grouped = (
    tips.groupby(['sex', 'day', 'time'], observed=True)['tip']
    .sum()
    .reset_index()
)

# Build the sunburst: segment size and colour both encode the summed tip
fig = px.sunburst(
    grouped,
    path=['sex', 'day', 'time'],  # hierarchy: sex → day → time
    values='tip',
    title='Total Tips by Gender, Day, and Time',
    color='tip',
    color_continuous_scale='Blues'
)

# Show the plot
fig.show()

In [None]:
import seaborn as sns
import pandas as pd
from sklearn.linear_model import LinearRegression
import numpy as np

# Load the Tips dataset
tips = sns.load_dataset('tips')

# Single-feature regression: predict the tip from the total bill
X = tips[['total_bill']]  # Feature (kept 2-D as a one-column DataFrame)
y = tips['tip']           # Target

# Initialize and train the linear regression model
model = LinearRegression()
model.fit(X, y)

# Display model parameters
print(f"Intercept: {model.intercept_:.2f}")
print(f"Coefficient: {model.coef_[0]:.2f}")

# Predict tip for a total bill of $30.
# The model was fitted on a DataFrame, so the query is passed as a DataFrame
# with the same column name; predicting from a bare ndarray makes scikit-learn
# warn that X does not have valid feature names.
sample_bill = pd.DataFrame({'total_bill': [30]})
predicted_tip = model.predict(sample_bill)
print(f"Predicted tip for a $30 bill: ${predicted_tip[0]:.2f}")

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import itertools

# Load dataset
dataset_path = 'https://raw.githubusercontent.com/Koldim2001/test_api/refs/heads/main/titanic.csv'
df = pd.read_csv(dataset_path)

# Columns used as model inputs
features = ['Pclass', 'Age', 'Fare', 'SibSp', 'Parch', 'Sex', 'Embarked']

# Keep the target plus the selected features and drop rows with missing values.
# dropna() is chained (returning a new frame) instead of being applied with
# inplace=True to the column subset, which raises SettingWithCopyWarning on
# pandas versions without copy-on-write.
df = df[['Survived'] + features].dropna()

# Define categorical and numerical columns
cat_features = ['Sex', 'Embarked']
num_features = ['Pclass', 'Age', 'Fare', 'SibSp', 'Parch']

# One-hot encode the categorical features (first level of each dropped);
# every other column is passed through unchanged
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(drop='first'), cat_features)
], remainder='passthrough')  # keep numerical as-is

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility)
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Separate X and y
X_train = train.drop(columns='Survived')
y_train = train['Survived']
X_test = test.drop(columns='Survived')
y_test = test['Survived']

# Pipeline: encoding followed by a depth-limited decision tree
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(max_depth=5, min_samples_split=150, random_state=42))
])

# Train model
model.fit(X_train, y_train)

# Evaluate on the held-out rows
preds = model.predict(X_test)
acc = accuracy_score(y_test, preds)
cm = confusion_matrix(y_test, preds)

print("Updated model accuracy:", acc)
print("\nClassification Report:\n", classification_report(y_test, preds, target_names=['Not Survived', 'Survived']))

# Plot confusion matrix
def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on a new figure.

    Parameters
    ----------
    cm : 2-D array
        Matrix of integer counts; it is indexed as ``cm[row, col]`` and must
        provide ``.max()`` and ``.shape``.
    classes : sequence
        Tick labels used on both axes, one per class.
    title : str
        Figure title.
    cmap : matplotlib colormap
        Colormap handed to ``imshow``.

    The function only draws; it returns nothing and does not call
    ``plt.show()``.
    """
    plt.figure()
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    # Same tick positions and labels on both axes
    tick_positions = np.arange(len(classes))
    plt.xticks(tick_positions, classes, rotation=45)
    plt.yticks(tick_positions, classes)

    # Cells above half of the largest count get white text, the rest black
    half_max = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            count = cm[row, col]
            text_color = "white" if count > half_max else "black"
            # x is the column (predicted), y is the row (true)
            plt.text(col, row, format(count, 'd'),
                     horizontalalignment="center",
                     color=text_color)

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

plot_confusion_matrix(cm, classes=['Not Survived', 'Survived'])

# Save the fitted pipeline (preprocessing + classifier)
with open('../outputs/models/model_dt_updated.pkl', 'wb') as f:
    pickle.dump(model, f)

# Predict for a new person
new_passenger = pd.DataFrame({
    'Pclass': [3],
    'Age': [55],
    'Fare': [7.25],
    'SibSp': [0],
    'Parch': [0],
    'Sex': ['male'],
    'Embarked': ['S']
})

prediction = model.predict(new_passenger)
print(f"\nPrediction for new passenger: {prediction}")
print("This person is most likely a survivor." if prediction[0] == 1 else "This person most likely perished.")

# Output column names as reported by the fitted ColumnTransformer.
# get_feature_names_out() already covers the passthrough columns, so
# num_features must not be appended again (doing so double-counts them).
feature_names = list(model.named_steps['preprocessor'].get_feature_names_out())

# Importances from the fitted tree, one value per transformed column
tree_model = model.named_steps['classifier']
importances = tree_model.feature_importances_

# Un-prefixed labels for plotting: one-hot column names first, then the
# passthrough numeric columns
ohe_feature_names = model.named_steps['preprocessor'].named_transformers_['cat'].get_feature_names_out(cat_features)
full_feature_names = list(ohe_feature_names) + num_features

# Check the lengths BEFORE building the DataFrame, so a mismatch fails with
# this message rather than with pandas' generic length error
print("Feature names:", len(full_feature_names))
print("Importances:", len(importances))
assert len(full_feature_names) == len(importances), "Mismatch between features and importances!"

# Now create DataFrame, most important feature first
importance_df = pd.DataFrame({
    'feature': full_feature_names,
    'importance': importances
}).sort_values('importance', ascending=False)


# Bar chart of feature importance.
# seaborn 0.13 deprecates `palette` without `hue`; mapping hue to the same
# variable as y gives the same per-bar colours. dodge=False keeps full-width
# bars on older seaborn, and any legend created for the redundant hue is removed.
plt.figure(figsize=(10, 6))
ax = sns.barplot(data=importance_df, x='importance', y='feature',
                 hue='feature', palette='viridis', dodge=False)
legend = ax.get_legend()
if legend is not None:
    legend.remove()
plt.title("Feature Importance in Decision Tree")
plt.tight_layout()
plt.show()

def experiment(max_depth, min_samples_split):
    """Train and evaluate a decision tree with the given hyper-parameters.

    Uses the module-level ``train`` / ``test`` frames and ``cat_features``.
    The categorical columns are one-hot encoded inside a pipeline, because
    the frames still contain the string columns 'Sex' and 'Embarked' and a
    bare DecisionTreeClassifier cannot be fitted on them.

    Parameters
    ----------
    max_depth : int or None
        Forwarded to ``DecisionTreeClassifier(max_depth=...)``.
    min_samples_split : int or float
        Forwarded to ``DecisionTreeClassifier(min_samples_split=...)``.

    Side effects
    ------------
    Prints the accuracy and classification report, draws the confusion
    matrix, and pickles the fitted pipeline to
    '../outputs/models/model_dt.pkl'. Returns nothing.
    """
    X_fit = train.drop('Survived', axis=1)
    y_fit = train['Survived']
    X_eval = test.drop('Survived', axis=1)
    y_eval = test['Survived']

    # A fresh encoder per call, so the shared module-level preprocessor
    # (already part of the fitted main pipeline) is never refitted here
    encoder = ColumnTransformer([
        ('cat', OneHotEncoder(drop='first'), cat_features)
    ], remainder='passthrough')

    # Build and train Decision Tree model
    model = Pipeline([
        ('preprocessor', encoder),
        ('classifier', DecisionTreeClassifier(max_depth=max_depth,
                                              min_samples_split=min_samples_split,
                                              random_state=42))
    ])
    model.fit(X_fit, y_fit)

    # Calculate accuracy metrics
    preds = model.predict(X_eval)
    acc = accuracy_score(y_eval, preds)
    cm = confusion_matrix(y_eval, preds)

    print("accuracy", acc)

    # Plot confusion matrix
    plot_confusion_matrix(cm, classes=['Not Survived', 'Survived'])

    # Classification report
    report = classification_report(y_eval, preds, target_names=['Not Survived', 'Survived'])
    print(report)

    # Save model in pickle format
    with open('../outputs/models/model_dt.pkl', 'wb') as f:
        pickle.dump(model, f)
        


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import itertools

# Load dataset
dataset_path = 'https://raw.githubusercontent.com/Koldim2001/test_api/refs/heads/main/titanic.csv'
df = pd.read_csv(dataset_path)

# Keep the target plus the selected features and drop rows with missing values.
# dropna() is chained (returning a new frame) instead of being applied with
# inplace=True to the column subset, which raises SettingWithCopyWarning on
# pandas versions without copy-on-write.
features = ['Pclass', 'Age', 'Fare', 'SibSp', 'Parch', 'Sex', 'Embarked']
df = df[['Survived'] + features].dropna()

# Define categorical and numerical columns
cat_features = ['Sex', 'Embarked']
num_features = ['Pclass', 'Age', 'Fare', 'SibSp', 'Parch']

# One-hot encode the categorical features (first level of each dropped);
# every other column is passed through unchanged
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(drop='first'), cat_features)
], remainder='passthrough')

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility)
train, test = train_test_split(df, test_size=0.2, random_state=42)
X_train = train.drop(columns='Survived')
y_train = train['Survived']
X_test = test.drop(columns='Survived')
y_test = test['Survived']

# Build and train pipeline: encoding followed by a depth-limited decision tree
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(max_depth=5, min_samples_split=150, random_state=42))
])
model.fit(X_train, y_train)

# Evaluate on the held-out rows
preds = model.predict(X_test)
acc = accuracy_score(y_test, preds)
cm = confusion_matrix(y_test, preds)
print("Updated model accuracy:", acc)
print("\nClassification Report:\n", classification_report(y_test, preds, target_names=['Not Survived', 'Survived']))

# Plot confusion matrix
def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on a new figure.

    Parameters
    ----------
    cm : 2-D array
        Matrix of integer counts; it is indexed as ``cm[row, col]`` and must
        provide ``.max()`` and ``.shape``.
    classes : sequence
        Tick labels used on both axes, one per class.
    title : str
        Figure title.
    cmap : matplotlib colormap
        Colormap handed to ``imshow``.

    The function only draws; it returns nothing and does not call
    ``plt.show()``.
    """
    plt.figure()
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    # Same tick positions and labels on both axes
    tick_positions = np.arange(len(classes))
    plt.xticks(tick_positions, classes, rotation=45)
    plt.yticks(tick_positions, classes)

    # Cells above half of the largest count get white text, the rest black
    half_max = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            count = cm[row, col]
            text_color = "white" if count > half_max else "black"
            # x is the column (predicted), y is the row (true)
            plt.text(col, row, format(count, 'd'),
                     horizontalalignment="center",
                     color=text_color)

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

# Draw the confusion matrix computed for the held-out rows
plot_confusion_matrix(cm, classes=['Not Survived', 'Survived'])

# Persist the fitted pipeline to disk
with open('../outputs/models/model_dt_updated.pkl', 'wb') as f:
    pickle.dump(model, f)

# Score one hypothetical passenger, given as a single-row frame whose
# columns carry the same names as the training features
new_passenger = pd.DataFrame([{
    'Pclass': 3,
    'Age': 55,
    'Fare': 7.25,
    'SibSp': 0,
    'Parch': 0,
    'Sex': 'male',
    'Embarked': 'S',
}])
prediction = model.predict(new_passenger)
print(f"\nPrediction for new passenger: {prediction}")

# Class 1 is reported as "survivor"; anything else as "perished"
if prediction[0] == 1:
    print("This person is most likely a survivor.")
else:
    print("This person most likely perished.")

# Feature importance
# Labels for the transformed columns: one-hot column names first, then the
# passthrough numeric columns
ohe_feature_names = model.named_steps['preprocessor'].named_transformers_['cat'].get_feature_names_out(cat_features)
full_feature_names = list(ohe_feature_names) + num_features
importances = model.named_steps['classifier'].feature_importances_

# Guard against a label/importance length mismatch before tabulating
assert len(full_feature_names) == len(importances), "Mismatch between features and importances!"

# One row per transformed column, most important first
importance_df = pd.DataFrame({
    'feature': full_feature_names,
    'importance': importances
}).sort_values('importance', ascending=False)

# Bar chart of feature importance.
# seaborn 0.13 deprecates `palette` without `hue`; mapping hue to the same
# variable as y gives the same per-bar colours. dodge=False keeps full-width
# bars on older seaborn, and any legend created for the redundant hue is removed.
plt.figure(figsize=(10, 6))
ax = sns.barplot(data=importance_df, x='importance', y='feature',
                 hue='feature', palette='viridis', dodge=False)
legend = ax.get_legend()
if legend is not None:
    legend.remove()
plt.title("Feature Importance in Decision Tree")
plt.tight_layout()
plt.show()

# Visualize decision tree (the fitted classifier step of the pipeline)
plt.figure(figsize=(20, 10))
plot_tree(model.named_steps['classifier'], 
          feature_names=full_feature_names, 
          class_names=['Not Survived', 'Survived'], 
          filled=True, max_depth=5)
plt.title("Decision Tree Visualization")
plt.tight_layout()
plt.show()

# Sweep the tree depth from 1 to 20 and record the error (1 - accuracy)
# on the training rows and on the held-out rows for each depth
errors_list = []
for depth in range(1, 21):
    depth_model = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', DecisionTreeClassifier(max_depth=depth, random_state=42)),
    ])
    depth_model.fit(X_train, y_train)
    errors_list.append({
        'Max Depth': depth,
        'Training Error': 1 - depth_model.score(X_train, y_train),
        'Validation Error': 1 - depth_model.score(X_test, y_test),
    })

# One row per depth
errors_df = pd.DataFrame(errors_list)

# Both error curves on a single set of axes
fig, ax = plt.subplots(figsize=(10, 6))
for curve in ('Training Error', 'Validation Error'):
    ax.plot(errors_df['Max Depth'], errors_df[curve], marker='o', label=curve)
ax.set_title("Training vs Validation Error by Tree Depth")
ax.set_xlabel("Max Tree Depth")
ax.set_ylabel("Prediction Error (1 - Accuracy)")
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()
