<a href="https://colab.research.google.com/github/GadhamsettyPranay/Amazon-Prime-Movies-and-TV-Shows-using-Power-BI/blob/main/Untitled1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#21BDS0295
#Module-1

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# 1. Read the txhousing CSV from its raw GitHub URL and preview the top rows.
url = 'https://raw.githubusercontent.com/salemprakash/EDA/main/Data/txhousing.csv'
df = pd.read_csv(url)
print("First 5 rows of the dataset:", df.head(), sep="\n")

# 2. Descriptive statistics: numeric columns first ...
print("\nSummary statistics for numerical data:", df.describe(), sep="\n")

# ... then the object-dtype columns, which this notebook treats as categorical.
categorical_cols = df.select_dtypes(include=['object']).columns
print("\nCategorical Columns:", categorical_cols, sep="\n")

# The categorical summary is only produced when such columns exist.
if not categorical_cols.empty:
    print("\nSummary for categorical columns:", df[categorical_cols].describe(), sep="\n")

# 3. Data Cleaning (Handling Missing Values)
# Count the missing entries in every column before choosing a strategy.
print("\nMissing values in the dataset:")
print(df.isnull().sum())

# Strategy A: discard every row that contains at least one missing value.
df_cleaned = df.dropna()
print("\nData after dropping missing values:")
print(df_cleaned.head())

# Strategy B: forward fill, i.e. copy the last observed value downwards.
# DataFrame.ffill() is used instead of fillna(method='ffill') because the
# `method` keyword of fillna is deprecated in recent pandas releases.
# NOTE(review): the fill runs over the frame in file order, so a gap at the
# start of one group of rows inherits the value from the rows above it --
# confirm that is acceptable for this dataset.
df_filled = df.ffill()
print("\nData after forward fill:")
print(df_filled.head())

# 4. Visualization (for Numerical and Categorical Data)
# Continuous variable: 20-bin histogram of the 'median' price column.
plt.figure(figsize=(8, 6))
df['median'].hist(bins=20)
plt.title('Histogram of Median Prices')
plt.xlabel('Median Price')
plt.ylabel('Frequency')
plt.show()

# Categorical variable: frequency bar chart of the first categorical column,
# drawn only when at least one categorical column was detected.
if len(categorical_cols):
    first_categorical = categorical_cols[0]
    plt.figure(figsize=(8, 6))
    df[first_categorical].value_counts().plot.bar()
    plt.title(f'Bar Chart of {first_categorical}')
    plt.xlabel(first_categorical)
    plt.ylabel('Frequency')
    plt.show()

# 5. Measurement Scales (Nominal, Ordinal, Interval, Ratio)
# Nominal example: unordered labels, summarised by how often each one occurs.
if 'city' in df:
    print("\nValue counts for 'city' column (Nominal scale example):")
    print(df['city'].value_counts())

# Ordinal example: split 'median' into five quantile buckets and attach
# ordered labels running from 'very low' to 'very high'.
rank_labels = ['very low', 'low', 'medium', 'high', 'very high']
df['rank'] = pd.qcut(df['median'], q=5, labels=rank_labels)
print("\nFirst 5 rows with 'rank' (Ordinal scale example):")
print(df.loc[:, ['median', 'rank']].head())

# 6. Comparing EDA with Classical and Bayesian Analysis
# Classical approach: one-sample t-test of the non-missing median prices
# against a hypothesised population mean of 150000.
observed_medians = df['median'].dropna()
t_stat, p_value = stats.ttest_1samp(observed_medians, popmean=150000)
print(f"\nT-test: t-stat = {t_stat}, p-value = {p_value}")

# Bayesian approach: not implemented here (a package such as PyMC3 would be
# needed, and it requires its own setup).

# 7. Software Tools for EDA
for tool_line in (
    "\nSoftware Tools for EDA:",
    "Python: Pandas, Seaborn, Matplotlib",
    "R: GGPlot2, dplyr, tidyr",
    "Excel/Tableau: For basic EDA tasks",
):
    print(tool_line)


In [None]:
#21BDS0295
#MODULE-2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import KBinsDiscretizer

# 1. Read the txhousing CSV from its raw GitHub URL and preview the top rows.
url = 'https://raw.githubusercontent.com/salemprakash/EDA/main/Data/txhousing.csv'
df = pd.read_csv(url)
print("First 5 rows of the dataset:", df.head(), sep="\n")

# 2. Data Deduplication
# Compare the frame's shape before and after removing fully duplicated rows;
# the original frame is left untouched and the result goes to df_deduped.
print("\nShape before deduplication:", df.shape)
df_deduped = df.drop_duplicates(keep='first')
print("Shape after deduplication:", df_deduped.shape)

# 3. Replacing Values
# Work on a copy so the loaded frame stays as read from the CSV.
df_replaced = df.copy()

# Step 1: turn every 0 in 'sales' into NaN.
sales_without_zeros = df_replaced['sales'].replace(0, np.nan)
df_replaced['sales'] = sales_without_zeros
print("\nSales column after replacing 0 with NaN:")
print(df_replaced[['sales']].head())

# Step 2: fill every NaN in 'sales' with the mean of the remaining values.
mean_sales = df_replaced['sales'].mean()
df_replaced['sales'] = df_replaced['sales'].fillna(mean_sales)
print("\nSales column after replacing NaN with mean value:")
print(df_replaced[['sales']].head())

# 4. Discretization and Binning
# Cut 'median' into five equal-width intervals on a copy of the frame.
# Missing prices are replaced by the column mean first, so every row lands
# in a bin.
df_binned = df.copy()
median_filled = df_binned['median'].fillna(df_binned['median'].mean())
bin_labels = ['Very Low', 'Low', 'Medium', 'High', 'Very High']
df_binned['median_binned'] = pd.cut(median_filled, bins=5, labels=bin_labels)
print("\nBinned 'median' column:")
print(df_binned.loc[:, ['median', 'median_binned']].head())

# 5. Handling Missing Data - Traditional Method
# The section is titled after Maximum Likelihood Estimation, but what the code
# does is plain mean substitution via SimpleImputer.
# Blank out 'sales' for index labels 10 through 15 on a copy so there are
# known gaps to fill.
df_with_na = df.copy()
df_with_na.loc[10:15, 'sales'] = np.nan

# Fit the imputer on the single 'sales' column and write the filled values
# back; fit_transform returns an (n_rows, 1) array, flattened here.
imputer = SimpleImputer(strategy='mean')
imputed_sales = imputer.fit_transform(df_with_na[['sales']])
df_with_na['sales'] = imputed_sales.ravel()
print("\nSales column after imputing missing values using MLE approximation (mean):")
print(df_with_na[['sales']].head(20))

# 6. Discretization with KBinsDiscretizer
# Four equal-width bins encoded as ordinal codes (0..3).  The discretizer
# cannot take NaN, so missing prices are replaced by the column mean first.
kbins = KBinsDiscretizer(n_bins=4, encode='ordinal', strategy='uniform')
median_for_kbins = df[['median']].fillna(df['median'].mean())
df['median_binned_kbins'] = kbins.fit_transform(median_for_kbins)
print("\nDiscretized 'median' column using KBinsDiscretizer:")
print(df.loc[:, ['median', 'median_binned_kbins']].head())

# 7. Visualizing Binned Data
plt.figure(figsize=(10, 6))
# 'median_binned' lives on df_binned (the pd.cut result), not on df.
# value_counts() orders by frequency, which would scramble the ordinal bins;
# sort_index() restores the category order 'Very Low' ... 'Very High' so the
# chart reads as a distribution.
df_binned['median_binned'].value_counts().sort_index().plot(kind='bar')
plt.title('Distribution of Median Price Bins')
plt.xlabel('Median Price Bins')
plt.ylabel('Frequency')
plt.show()


In [None]:
#21BDS0295
#MODULE-3

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Read the txhousing CSV from its raw GitHub URL.
url = 'https://raw.githubusercontent.com/salemprakash/EDA/main/Data/txhousing.csv'
df = pd.read_csv(url)

# 1. Univariate Analysis
# Per-column descriptive statistics for the numeric features.
print("Univariate Analysis (Summary Statistics):", df.describe(), sep="\n")

# One variable at a time: 20-bin histogram of the 'median' price column.
plt.figure(figsize=(10, 6))
df['median'].hist(bins=20)
plt.title('Histogram of Median Prices')
plt.xlabel('Median Price')
plt.ylabel('Frequency')
plt.show()

# 2. Bivariate Analysis
print("\nBivariate Analysis (Correlation Matrix):")
# Correlation is only defined for numeric data, so drop the other columns.
numeric_features = df.select_dtypes(include=np.number)
correlation_matrix = numeric_features.corr()
print(correlation_matrix)

# The same matrix as an annotated heatmap.
plt.figure(figsize=(10, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix Heatmap')
plt.show()

# Pairwise relationship between median price and number of sales.
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df, x='median', y='sales')
plt.title('Scatter Plot of Median Prices vs Sales')
plt.xlabel('Median Price')
plt.ylabel('Sales')
plt.show()

# 3. Multivariate Analysis
# Grid of scatter plots / histograms across four numeric columns at once.
print("\nMultivariate Analysis (Pairplot):")
pairplot_columns = ['median', 'sales', 'inventory', 'volume']
sns.pairplot(df[pairplot_columns])
plt.show()

# 4. Time Series Analysis (TSA)
# Convert 'date' column to datetime.
# NOTE(review): this assumes a float 'date' column is a decimal year
# (year + (month - 1) / 12, e.g. 2000.083 for February 2000) -- confirm against
# the CSV.  pd.to_datetime() would read such floats as nanoseconds since
# 1970-01-01, collapsing every row onto (almost) the same 1970 timestamp and
# making the yearly grouping and resampling below meaningless.  Non-float
# columns (e.g. date strings) keep the original pd.to_datetime() parsing.
if pd.api.types.is_float_dtype(df['date']):
    year_part = np.floor(df['date']).astype(int)
    # rint() absorbs the rounding error of the stored fraction.
    month_part = np.rint((df['date'] - year_part) * 12).astype(int) + 1
    df['date'] = pd.to_datetime(
        pd.DataFrame({'year': year_part, 'month': month_part, 'day': 1})
    )
else:
    df['date'] = pd.to_datetime(df['date'])

# Time-based indexing: the DatetimeIndex enables .index.year and resample().
df.set_index('date', inplace=True)
print("\nDataset with time-based index:")
print(df.head())

# Visualizing Time Series Data
plt.figure(figsize=(10, 6))
df['sales'].plot()
plt.title('Sales Over Time')
plt.xlabel('Date')
plt.ylabel('Sales')
plt.show()

# Grouping Time Series Data (Yearly)
# Derive the calendar year from the DatetimeIndex (this writes the 'year'
# column) and total the sales recorded in each year.
df['year'] = df.index.year
yearly_sales = df['sales'].groupby(df['year']).sum()

plt.figure(figsize=(10, 6))
yearly_sales.plot.bar()
plt.title('Yearly Sales')
plt.xlabel('Year')
plt.ylabel('Total Sales')
plt.show()

# Resampling Time Series Data (Monthly)
# Explicit offset objects replace the 'M' / 'Q' string aliases: those aliases
# are deprecated in newer pandas (renamed 'ME' / 'QE'), while the new spellings
# are unknown to older releases.  The offset classes behave the same on both.
monthly_sales = df['sales'].resample(pd.offsets.MonthEnd()).sum()

plt.figure(figsize=(10, 6))
monthly_sales.plot()
plt.title('Monthly Sales Over Time')
plt.xlabel('Date')
plt.ylabel('Monthly Sales')
plt.show()

# Resampling for Quarterly Data
# startingMonth=12 gives calendar quarters ending in Mar/Jun/Sep/Dec, which is
# what the 'Q' alias meant.
quarterly_sales = df['sales'].resample(pd.offsets.QuarterEnd(startingMonth=12)).sum()

plt.figure(figsize=(10, 6))
quarterly_sales.plot(kind='bar')
plt.title('Quarterly Sales Over Time')
plt.xlabel('Date')
plt.ylabel('Quarterly Sales')
plt.show()


In [None]:
#21BDS0295
#MODULE-4

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 1. Read the txhousing CSV from its raw GitHub URL and preview the top rows.
url = 'https://raw.githubusercontent.com/salemprakash/EDA/main/Data/txhousing.csv'
df = pd.read_csv(url)
print("First 5 rows of the dataset:", df.head(), sep="\n")

# 2. Statistical Summary Measures (numeric columns)
print("\nStatistical Summary:", df.describe(), sep="\n")

# 3. Data Elaboration
# info() writes its report itself, so it is called rather than printed.
print("\nData Information:")
df.info()

# Number of missing entries per column.
print("\nMissing Values:", df.isnull().sum(), sep="\n")

# 4. 1-D Statistical Data Analysis (Univariate Analysis)
# Distribution of a single numeric column: 20-bin histogram of 'median'.
plt.figure(figsize=(8, 6))
df['median'].plot.hist(bins=20)
plt.title('1-D Statistical Analysis: Histogram of Median Prices')
plt.xlabel('Median Price')
plt.ylabel('Frequency')
plt.show()

# 5. 2-D Statistical Data Analysis (Bivariate Analysis)
# Relationship between two numeric columns: 'median' on x, 'sales' on y.
median_prices = df['median']
sales_counts = df['sales']
plt.figure(figsize=(8, 6))
plt.scatter(median_prices, sales_counts, alpha=0.6)
plt.title('2-D Statistical Analysis: Scatter Plot (Median vs Sales)')
plt.xlabel('Median Price')
plt.ylabel('Sales')
plt.show()

# 6. n-D Statistical Data Analysis (Multivariate Analysis)
# Pairplot (for 3 variables: 'median', 'sales', 'volume'), using only rows
# where all three are present.
# sns.pairplot() creates its own figure, so no plt.figure() call precedes it;
# doing so only produced an extra empty figure.
sns.pairplot(df[['median', 'sales', 'volume']].dropna())
# The pairplot figure is the current figure here, so suptitle lands on it;
# y=1.02 lifts the title above the top row of panels.
plt.suptitle('n-D Statistical Data Analysis: Pairplot (Median, Sales, Volume)', y=1.02)
plt.show()

# 7. Contingency Table
# Quartile-based price category derived from 'median' (rows with a missing
# median stay NaN and are left out of the table).
df['median_category'] = pd.qcut(df['median'], 4, labels=['Low', 'Medium', 'High', 'Very High'])

# Two-level sales indicator: below 500 is 'Low', otherwise 'High'.
# na_action='ignore' keeps missing sales as NaN; without it `NaN < 500` is
# False, so every row with unknown sales was silently counted as 'High'.
sales_level = df['sales'].map(lambda x: 'Low' if x < 500 else 'High', na_action='ignore')

contingency_table = pd.crosstab(df['median_category'], sales_level)
print("\nContingency Table (Median Category vs Sales):")
print(contingency_table)

# 8. Visualization (Scatter plots, Dot charts, Bar Plots)
# Scatter plot: 'median' on x against 'volume' on y, drawn in green.
median_prices = df['median']
plt.figure(figsize=(8, 6))
plt.scatter(median_prices, df['volume'], alpha=0.6, color='g')
plt.title('Scatter Plot: Median vs Volume')
plt.xlabel('Median Price')
plt.ylabel('Volume')
plt.show()

# Dot chart: one blue marker per row, 'sales' plotted against the row index.
plt.figure(figsize=(8, 6))
plt.plot(df['sales'], 'o', color='b')
plt.title('Dot Chart: Sales')
plt.xlabel('Index')
plt.ylabel('Sales')
plt.show()

# Bar plot: number of rows falling in each 'median_category' level.
category_counts = df['median_category'].value_counts()
plt.figure(figsize=(8, 6))
category_counts.plot.bar(color='orange')
plt.title('Bar Plot: Median Category Counts')
plt.xlabel('Median Category')
plt.ylabel('Count')
plt.show()


In [None]:
#21BDS0295
#MODULE-5

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import SpectralClustering, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.datasets import make_blobs
from scipy.sparse.csgraph import minimum_spanning_tree
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

# Read the txhousing CSV from its raw GitHub URL.
url = "https://raw.githubusercontent.com/salemprakash/EDA/main/Data/txhousing.csv"
df = pd.read_csv(url)

# Feature matrix for the clustering demos: the 'median' and 'year' columns,
# keeping only rows where both are present, then standardised column-wise.
cluster_features = ['median', 'year']
X = df.dropna(subset=cluster_features)[cluster_features]
feature_scaler = StandardScaler()
X_scaled = feature_scaler.fit_transform(X)

# 1. Spectral Clustering
# Three clusters on a nearest-neighbours affinity graph of the scaled data.
# random_state pins the randomised steps so repeated runs of the notebook give
# the same labels (and the same silhouette score further down).
spectral_clustering = SpectralClustering(
    n_clusters=3, affinity='nearest_neighbors', random_state=42
)
labels_spectral = spectral_clustering.fit_predict(X_scaled)

plt.figure(figsize=(10, 6))
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=labels_spectral, cmap='viridis')
plt.title('Spectral Clustering')
plt.xlabel('Scaled Median')
# Both axes show standardised values, so the year axis is labelled as scaled too.
plt.ylabel('Scaled Year')
plt.show()

# 2. Document Clustering (Example with synthetic data)
# No real documents are used: make_blobs generates 300 two-dimensional points
# around 5 centres as a stand-in for document feature vectors.
documents, _ = make_blobs(n_samples=300, centers=5, cluster_std=0.60, random_state=0)
plt.figure(figsize=(10, 6))
plt.scatter(documents[:, 0], documents[:, 1], s=30)
plt.title('Synthetic Data for Document Clustering')
plt.show()

# Using KMeans for document clustering
from sklearn.cluster import KMeans

# random_state makes the centroid initialisation (and so the labels)
# reproducible; n_init is spelled out so the result does not depend on which
# scikit-learn default applies.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
labels_kmeans = kmeans.fit_predict(documents)

plt.figure(figsize=(10, 6))
plt.scatter(documents[:, 0], documents[:, 1], c=labels_kmeans, cmap='viridis')
plt.title('Document Clustering using KMeans')
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()

# 3. Minimum Spanning Tree Clustering
# The tree has to be built over the observations: one node per row, edge
# weight = Euclidean distance between rows.  Feeding np.corrcoef(X_scaled.T)
# (the 2x2 feature-correlation matrix) produced a "tree" over the two features
# that said nothing about how the rows group together.
from matplotlib.collections import LineCollection
from scipy.sparse import coo_matrix
from scipy.sparse.csgraph import connected_components
from scipy.spatial.distance import pdist, squareform

MST_MAX_POINTS = 1000  # the dense distance matrix grows quadratically, so cap the rows used
MST_N_CLUSTERS = 3     # same cluster count as the other methods in this cell

# Reproducible random subset of rows (all rows when there are fewer than the cap).
mst_rng = np.random.default_rng(42)
mst_rows = mst_rng.choice(
    len(X_scaled), size=min(MST_MAX_POINTS, len(X_scaled)), replace=False
)
X_mst = X_scaled[mst_rows]

mst = minimum_spanning_tree(squareform(pdist(X_mst))).tocoo()

# Removing the (k - 1) heaviest edges of a spanning tree splits it into k
# connected pieces; those pieces are the clusters.
n_edges_kept = max(mst.nnz - (MST_N_CLUSTERS - 1), 0)
kept_edges = np.argsort(mst.data)[:n_edges_kept]
mst_forest = coo_matrix(
    (mst.data[kept_edges], (mst.row[kept_edges], mst.col[kept_edges])),
    shape=mst.shape,
)
_, labels_mst = connected_components(mst_forest, directed=False)

# Draw the remaining tree edges with the points coloured by component.
plt.figure(figsize=(10, 6))
edge_segments = np.stack([X_mst[mst_forest.row], X_mst[mst_forest.col]], axis=1)
plt.gca().add_collection(LineCollection(edge_segments, colors='grey', linewidths=0.5))
plt.scatter(X_mst[:, 0], X_mst[:, 1], c=labels_mst, cmap='viridis', s=15, zorder=2)
plt.title('Minimum Spanning Tree')
plt.xlabel('Scaled Median')
plt.ylabel('Scaled Year')
plt.show()

# 4. Model-based Clustering - Expectation-Maximization Algorithm
# A three-component Gaussian mixture is fitted to the scaled data and each
# row is assigned to a component by predict().
# random_state fixes the initialisation so the labels are reproducible.
gmm = GaussianMixture(n_components=3, random_state=42)
gmm.fit(X_scaled)
labels_gmm = gmm.predict(X_scaled)

plt.figure(figsize=(10, 6))
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=labels_gmm, cmap='viridis')
plt.title('Model-based Clustering using EM Algorithm')
plt.xlabel('Scaled Median')
# Both axes show standardised values, so the year axis is labelled as scaled too.
plt.ylabel('Scaled Year')
plt.show()

# 5. Hierarchical Agglomerative Clustering
# Bottom-up merging of the scaled rows until three clusters remain.
agglo = AgglomerativeClustering(n_clusters=3)
labels_agglo = agglo.fit_predict(X_scaled)

plt.figure(figsize=(10, 6))
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=labels_agglo, cmap='viridis')
plt.title('Hierarchical Agglomerative Clustering')
plt.xlabel('Scaled Median')
# The plotted values are the standardised year (second column of X_scaled),
# not calendar years, so the axis is labelled accordingly.
plt.ylabel('Scaled Year')
plt.show()

# 6. Outlier Detection using Clustering
# An Isolation Forest is used here; fit_predict() returns one label per row
# (shown with the 'coolwarm' colour map below).
from sklearn.ensemble import IsolationForest

# contamination=0.1 sets the expected share of outliers; random_state makes
# the randomly built trees, and so the flagged rows, reproducible.
iso_forest = IsolationForest(contamination=0.1, random_state=42)
outliers = iso_forest.fit_predict(X_scaled)

plt.figure(figsize=(10, 6))
plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=outliers, cmap='coolwarm')
plt.title('Outlier Detection using Isolation Forest')
plt.xlabel('Scaled Median')
# Both axes show standardised values, so the year axis is labelled as scaled too.
plt.ylabel('Scaled Year')
plt.show()

# Silhouette score of each labelling, computed on the data it was fitted to:
# the scaled housing features for three of the methods, the synthetic blobs
# for KMeans.
scored_clusterings = (
    ("Spectral", X_scaled, labels_spectral),
    ("KMeans", documents, labels_kmeans),
    ("GMM", X_scaled, labels_gmm),
    ("Agglomerative", X_scaled, labels_agglo),
)
for method_name, points, point_labels in scored_clusterings:
    print(f"Silhouette Score for {method_name} Clustering:", silhouette_score(points, point_labels))


In [None]:
#21BDS0295
#MODULE-6

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE, MDS
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Read the txhousing CSV from its raw GitHub URL.
url = 'https://raw.githubusercontent.com/salemprakash/EDA/main/Data/txhousing.csv'
df = pd.read_csv(url)

# Only numeric columns can be fed to the scaler and the projections below.
numerical_cols = df.select_dtypes(include=np.number).columns
df_numerical = df[numerical_cols]

# 1. Standardize the data
# The scaler is fitted on a NaN-free version of the data: each missing entry
# is replaced by its column's mean first.
column_means = df_numerical.mean()
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_numerical.fillna(column_means))

# 2. Principal Component Analysis (PCA)
# Project the standardised data onto two principal components.
pca = PCA(n_components=2)
principal_components = pca.fit_transform(df_scaled)
df_pca = pd.DataFrame(principal_components, columns=['PC1', 'PC2'])

# 3. t-Distributed Stochastic Neighbor Embedding (t-SNE)
# Two-dimensional embedding with perplexity 30; the seed is fixed.
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
tsne_results = tsne.fit_transform(df_scaled)
df_tsne = pd.DataFrame(tsne_results, columns=['t-SNE1', 't-SNE2'])

# 4. Multidimensional Scaling (MDS)
# Two-dimensional embedding with a fixed seed.
# NOTE(review): MDS works from all pairwise distances between rows, so run
# time and memory grow quickly with the row count -- confirm this is
# acceptable for the full dataset.
mds = MDS(n_components=2, random_state=42)
mds_results = mds.fit_transform(df_scaled)
df_mds = pd.DataFrame(mds_results, columns=['MDS1', 'MDS2'])

# 5. K-Means Clustering
# Three clusters on the standardised data.  n_init is spelled out so the
# result does not depend on which scikit-learn default applies.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)  # Example with 3 clusters
clusters = kmeans.fit_predict(df_scaled)
# df_numerical was obtained by selecting columns from df, so adding a column
# to it in place triggers pandas' SettingWithCopyWarning.  assign() returns a
# new frame with the extra 'cluster' column instead.
df_numerical = df_numerical.assign(cluster=clusters)

# Visualization
# One row of three panels, one per embedding: (panel title, frame, x column,
# y column).
embedding_panels = (
    ('PCA', df_pca, 'PC1', 'PC2'),
    ('t-SNE', df_tsne, 't-SNE1', 't-SNE2'),
    ('MDS', df_mds, 'MDS1', 'MDS2'),
)

plt.figure(figsize=(12, 4))
for panel_number, (panel_title, frame, x_col, y_col) in enumerate(embedding_panels, start=1):
    plt.subplot(1, 3, panel_number)
    plt.scatter(frame[x_col], frame[y_col])
    plt.title(panel_title)

plt.tight_layout()
plt.show()

# The PCA projection again, this time coloured by the K-Means cluster label.
cluster_labels = df_numerical['cluster']
plt.figure(figsize=(8, 6))
plt.scatter(df_pca['PC1'], df_pca['PC2'], c=cluster_labels)
plt.title('Clusters visualized with PCA')
plt.show()

In [None]:
#21BDS0295
#MODULE-7

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Read the txhousing CSV from its raw GitHub URL.
url = "https://raw.githubusercontent.com/salemprakash/EDA/main/Data/txhousing.csv"
df = pd.read_csv(url)

# Preview the top of the frame.
print("First 5 rows of the dataset:", df.head(), sep="\n")

# Data cleaning: keep only the rows that have no missing value in any column.
df = df.dropna()

# 1. Constructing Linear Regression Model
# Target: 'median'.  Features: every other numeric column of the frame.
numeric_frame = df.select_dtypes(include=[np.number])
X = numeric_frame.drop(columns=['median'])
y = df['median']

# Hold out 20% of the rows for testing; the split is seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit ordinary least squares on the training portion (fit() returns the
# estimator itself, so construction and fitting are chained).
model = LinearRegression().fit(X_train, y_train)

# 2. Making Predictions on the held-out rows
y_pred = model.predict(X_test)

# 3. Evaluating the Model
# Mean squared error and coefficient of determination on the test set.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"\nMean Squared Error: {mse:.2f}")
print(f"R-squared: {r2:.2f}")

# Actual vs predicted prices for the test rows.  The dashed red diagonal spans
# the full range of the target and marks where prediction equals actual.
price_range = [y.min(), y.max()]
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.6)
plt.plot(price_range, price_range, 'r--')
plt.xlabel('Actual Median Prices')
plt.ylabel('Predicted Median Prices')
plt.title('Actual vs Predicted Median Prices')
plt.show()

# Explanatory notes printed for the reader, one line per entry and in this
# order: why "accuracy" does not apply to regression, what reinforcement
# learning is, how it differs from supervised learning, and where it is used.
explanatory_lines = (
    "\nIn regression tasks, accuracy is not defined in the same way. We typically use metrics like MSE, R-squared, etc., to evaluate model performance.",
    "\nReinforcement Learning (RL):",
    "Reinforcement Learning is a type of machine learning where an agent learns to make decisions by taking actions in an environment to maximize cumulative reward.",
    "\nDifference between Supervised Learning and Reinforcement Learning:",
    "1. Supervised Learning: The model learns from labeled data (input-output pairs).",
    "2. Reinforcement Learning: The model learns from the consequences of its actions (rewards or penalties).",
    "\nApplications of Reinforcement Learning:",
    "1. Robotics: For teaching robots to perform tasks through trial and error.",
    "2. Game Playing: For training agents to play games like Chess or Go.",
    "3. Autonomous Vehicles: For navigation and decision-making.",
    "4. Personalized Recommendations: For dynamically adjusting content based on user interactions.",
)
for explanatory_line in explanatory_lines:
    print(explanatory_line)
