<a href="https://colab.research.google.com/github/FrankieBoyC/DataCleaningProject/blob/main/UnitThreeTryIt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import seaborn as sns
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn import cluster



# Fetch seaborn's example Titanic table and keep a working handle named `df`
# that the cells below read from.
titanic = sns.load_dataset(name='titanic')
df = pd.DataFrame(data=titanic)
# Bare last expression: the notebook renders a preview of the frame.
df

In [None]:
# Number One
# Finding Outliers
# IQR rule: an age is flagged when it lies more than `threshold` IQRs
# below the first quartile or above the third quartile.
threshold = 1.5
Q1, Q3 = (df['age'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
lower_fence = Q1 - threshold * IQR
upper_fence = Q3 + threshold * IQR

# Missing ages compare False on both sides, so they are never flagged.
is_outlier = df['age'].lt(lower_fence) | df['age'].gt(upper_fence)
outliers = df.loc[is_outlier]
print(outliers)

In [None]:
# Number Two
# Scatter Plot
# Passenger age (x) against ticket fare (y).
plt.scatter(df['age'], df['fare'])
plt.title('Titanic Scatter Plot')
plt.xlabel('Age')
plt.ylabel('Fare')
# `plt.show` without parentheses only references the function; it has to be
# called to render the figure.
plt.show()

In [None]:
# Histogram
# Distribution of ticket fares in 20 equal-width bins, drawn on an explicit
# figure/axes pair.
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(df['fare'], bins=20)
ax.set_title('Titanic Histogram')
ax.set_xlabel('Fare')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Box Plot
# Age distribution split by sex.
# The axes are created up front and handed to pandas: with `by=...` and no
# `ax`, DataFrame.boxplot opens its own figure, which would leave an empty
# 8x6 figure behind and ignore the requested size.
fig, ax = plt.subplots(figsize=(8, 6))
df.boxplot(column='age', by='sex', ax=ax)
ax.set_title('Titanic Box Plot')
# The plotted column is `age`, so label the axis accordingly.
ax.set_ylabel('Age')
plt.show()

In [None]:
# Number Three
# Correlation Matrix
# Observation: `adult_male` has a very strong negative correlation with
# `survived`, which points to women and children surviving at a higher rate.
# Pearson is the default method; it is spelled out here for the reader.
correlation = df.corr(method='pearson', numeric_only=True)
print(correlation)

In [None]:
# Number Four
# Prediction Model
# Build the feature frame: everything except the target and two columns the
# author chose to leave out.
# Only the columns listed in `numeric_cols` / `categorical_cols` below reach
# the model; every other column (for example 'alive', which restates the
# target) is discarded because ColumnTransformer's `remainder` defaults to
# 'drop'.
X = df.drop(columns=['survived', 'class', 'parch'])

# Fill missing values with the median (age) and the mode (embarked).
# The result is assigned back instead of using `X[col].fillna(..., inplace=True)`:
# that chained in-place form acts on a temporary object, raises a FutureWarning
# on pandas 2.x and leaves the NaNs untouched under Copy-on-Write.
X['age'] = X['age'].fillna(X['age'].median())
X['embarked'] = X['embarked'].fillna(X['embarked'].mode()[0])

# Select categorical columns and numeric columns
categorical_cols = ['sex', 'embarked']
numeric_cols = X.select_dtypes(include=['float64', 'int64']).columns.tolist()

# Create a preprocessing pipeline: standardise numeric features, one-hot
# encode categorical ones (categories unseen at fit time encode as all zeros).
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)])

# Split data into training and test sets (70/30, fixed seed for repeatability)
y = df['survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit the preprocessing on the training rows only, then reuse it for the test rows
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Train the model
lr = LinearRegression()
lr.fit(X_train_processed, y_train)

# Predict on test data; the outputs are continuous scores, not 0/1 labels
pred = lr.predict(X_test_processed).reshape(-1, 1)
print(pred)

In [None]:
# Function to make a prediction
def predict_survival(passenger_class, sex, passenger_age, siblings_spouses, parents_children, ticket_fare, port_embarked, travel_class, person_type, male, deck_level, home_town, survival_status, solo_traveler):
    """Print whether the trained model predicts that a passenger survived.

    The passenger is scored with the already fitted module-level
    ``preprocessor`` and ``lr`` from the "Prediction Model" cell, so that cell
    must be run first. Previously a brand-new regression was fitted on the
    single query row with a label of 0, which printed "Did not survive" for
    every input.

    Parameters mirror the Titanic columns, in dataset order:
    passenger_class -> pclass, sex -> sex, passenger_age -> age,
    siblings_spouses -> sibsp, parents_children -> parch, ticket_fare -> fare,
    port_embarked -> embarked, travel_class -> class, person_type -> who,
    male -> adult_male, deck_level -> deck, home_town -> embark_town,
    survival_status -> alive, solo_traveler -> alone.

    ``sex`` and ``port_embarked`` should use the dataset's own string values
    (e.g. 'male' / 'female', 'S' / 'C' / 'Q'); any other value is an unknown
    category and contributes nothing to the score. ``parents_children`` and
    ``travel_class`` are accepted for backward compatibility only, because the
    training frame dropped 'parch' and 'class'.

    Returns:
        None. Prints "Survived" when the regression score is at least 0.5,
        otherwise "Did not survive".
    """
    # One-row frame with the same columns, in the same order, as the training
    # frame X, so the fitted ColumnTransformer can select what it needs by name.
    passenger = pd.DataFrame([{
        'pclass': passenger_class,
        # str() keeps the categorical cells string-typed like the training data.
        'sex': str(sex),
        'age': passenger_age,
        'sibsp': siblings_spouses,
        'fare': ticket_fare,
        'embarked': str(port_embarked),
        'who': person_type,
        'adult_male': male,
        'deck': deck_level,
        'embark_town': home_town,
        'alive': survival_status,
        'alone': solo_traveler,
    }])

    features = preprocessor.transform(passenger)
    score = float(lr.predict(features)[0])

    # The model regresses a 0/1 label, so its output is a continuous score;
    # 0.5 is the midpoint between the two classes.
    if score >= 0.5:
        print("Survived")
    else:
        print("Did not survive")

# function call: a 25-year-old man travelling alone in third class from Southampton
predict_survival(3, 'male', 25.0, 0, 0, 8.05, 'S', 'Third', 'man', True, None, 'Southampton', 'no', True)

In [None]:
# Number Five
# K-Means Clustering
# `make_blobs`, `KMeans` and `plt` come from the import cell at the top of the
# notebook; the previous `from seaborn.utils import plt` relied on an
# incidental re-export.

# Synthetic 2-D data: 200 points drawn around 3 centres.
X2, y2 = make_blobs(n_samples=200, centers=3,
                       cluster_std=0.50, random_state=0)

# Fit K-Means with as many clusters as generated centres. `n_init` is set
# explicitly so the result does not depend on the installed version's default.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0)
cluster_labels = kmeans.fit_predict(X2)

# Colour each point by its assigned cluster; without `c`, matplotlib ignores
# `cmap` and emits a warning.
plt.scatter(X2[:, 0], X2[:, 1], c=cluster_labels, s=50, alpha=0.5, cmap='viridis')
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
            c='red', marker='x', s=100)
plt.title('K-Means Clusters')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()

In [None]:
# Number Six
# NOTE: `df` is rebound to the iris data here, so after this cell it no longer
# refers to the Titanic frame.
iris = sns.load_dataset('iris')
df = pd.DataFrame(iris)

import plotly.express as px
# Plot the iris frame: the Titanic frame has no 'sepal_length', 'sepal_width'
# or 'species' columns, so passing it here raised an error.
fig = px.scatter(df, x='sepal_length', y='sepal_width', hover_name='species') #The info shows by hovering the mouse on a data point in a Plotly visualization.
fig.update_layout(title_text="Iris Data Set Cluster",
                  title_font_size=30)
fig.show()
