In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew, kurtosis, ttest_ind
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Read the diamonds data from disk; the steps below all work off this frame.
df = pd.read_csv('diamonds.csv')

# Quick structural overview, printed in this order:
#   1. the first five rows,
#   2. the number of missing values per column,
#   3. the number of fully duplicated rows,
#   4. the dtype pandas inferred for each column.
overview = (
    df.head(),
    df.isna().sum(),
    df.duplicated().sum(),
    df.dtypes,
)
for item in overview:
    print(item)

# Draw one boxplot per numerical column on a shared axis.
numeric_cols = ['carat', 'depth', 'table', 'price', 'x', 'y', 'z']
plt.boxplot([df[col] for col in numeric_cols])
# plt.boxplot places the boxes at positions 1..N and labels them with those
# numbers; relabel the ticks with the column names so each box is identifiable.
plt.xticks(range(1, len(numeric_cols) + 1), numeric_cols)
plt.title('Boxplots of the numerical columns')
plt.show()

# Draw one 50-bin histogram per numerical column of the frame.
df.hist(bins=50, figsize=(20, 15))
plt.show()

# Heatmap of the pairwise correlations between the numerical columns.
# The frame also holds non-numeric columns (e.g. 'cut' stores labels such as
# 'Premium'), and DataFrame.corr() raises on those in pandas >= 2.0, so
# restrict the frame to numeric dtypes before computing the correlations.
corr = df.select_dtypes(include='number').corr()
sns.heatmap(corr, cmap='coolwarm', annot=True)
plt.title('Correlation matrix of the numerical columns')
plt.show()

# Price against carat, one marker per row.
carat_values = df['carat']
price_values = df['price']
plt.scatter(carat_values, price_values)
plt.xlabel('Carat')
plt.ylabel('Price')
plt.show()

# Add a 'volume' feature: the element-wise product of the x, y and z columns.
df['volume'] = df['x'].mul(df['y']).mul(df['z'])

# Number of rows per 'cut' value, drawn as a bar chart.
cut_counts = df['cut'].value_counts()
cut_counts.plot.bar()
plt.xlabel('Cut')
plt.ylabel('Count')
plt.show()

# Pairwise scatterplots (with per-variable distributions on the diagonal)
# for four of the numerical columns.
pair_cols = ['carat', 'depth', 'table', 'price']
sns.pairplot(df.loc[:, pair_cols])
plt.show()

# Plot price against date.
# This step needs a 'date' column, but nothing earlier in this notebook
# creates or checks for one. Guard the plot so a missing column is reported
# instead of aborting the rest of the cell with a KeyError.
if 'date' in df.columns:
    df.plot(x='date', y='price')
    plt.xlabel('Date')
    plt.ylabel('Price')
    plt.show()
else:
    print("Skipping time-series plot: the dataset has no 'date' column.")

# Independent two-sample t-test on price: rows whose cut is 'Premium'
# versus rows whose cut is 'Good' (scipy's default settings).
is_premium = df['cut'] == 'Premium'
is_good = df['cut'] == 'Good'
premium_prices = df.loc[is_premium, 'price']
good_prices = df.loc[is_good, 'price']
t_stat, p_value = ttest_ind(premium_prices, good_prices)
print('t-statistic:', t_stat)
print('p-value:', p_value)

# Column-wise skewness and kurtosis of the numerical columns. scipy returns
# one value per column, in the same order as the column list below.
numeric_view = df[['carat', 'depth', 'table', 'price', 'x', 'y', 'z']]
print('Skewness:', skew(numeric_view))
print('Kurtosis:', kurtosis(numeric_view))

# Number of rows per 'cut' value, drawn with seaborn's count plot.
sns.countplot(x='cut', data=df)
plt.show()

# Share of each 'color' value as a pie chart. Equal axis scaling keeps the
# pie circular rather than elliptical.
color_counts = df['color'].value_counts()
color_counts.plot.pie()
plt.axis('equal')
plt.show()

# Variance Inflation Factor (VIF) for the numerical predictor columns.
# variance_inflation_factor regresses one column of the matrix it receives on
# the remaining columns and does not add an intercept itself. Without a
# constant column those auxiliary regressions are forced through the origin,
# which inflates the reported VIFs, so append a constant before computing.
X = df[['carat', 'depth', 'table', 'x', 'y', 'z']]
X_design = X.assign(const=1.0)
vif = pd.DataFrame()
# Iterate over the original feature positions only: X.shape[1] stops before
# the trailing constant column, so no VIF is reported for the intercept.
vif['VIF'] = [variance_inflation_factor(X_design.values, i) for i in range(X.shape[1])]
vif['feature'] = X.columns
print(vif)

# Distribution of 'price': histogram with a kernel density estimate overlaid.
price_column = df['price']
sns.histplot(price_column, kde=True)
plt.show()



# Reload the dataset into a fresh DataFrame.
df = pd.read_csv('diamonds.csv')

# Reloading discards every column that was added to the previous frame,
# including the engineered 'volume' column that the volume-vs-price
# scatterplot further down reads. Re-derive it here (product of x, y and z)
# so that plot does not fail with a KeyError.
df['volume'] = df['x'] * df['y'] * df['z']

# Check for outliers in the numerical columns
sns.boxplot(data=df[['carat', 'depth', 'table', 'price', 'x', 'y', 'z']])
plt.show()

# Drop price outliers with the 1.5 * IQR rule: keep only rows whose price
# lies within [Q1 - 1.5*IQR, Q3 + 1.5*IQR], both bounds included.
Q1, Q3 = (df['price'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
lower_fence = Q1 - 1.5*IQR
upper_fence = Q3 + 1.5*IQR
df = df[df['price'].between(lower_fence, upper_fence)]

# Distribution of 'price' on the outlier-filtered frame (histogram + KDE).
filtered_prices = df['price']
sns.histplot(filtered_prices, kde=True)
plt.show()

# Spread of 'price' within each 'cut' value, one box per cut.
sns.boxplot(x='cut', y='price', data=df)
plt.show()

# Price against the 'volume' column, one marker per row.
ax = plt.gca()
ax.scatter(df['volume'], df['price'])
ax.set_xlabel('Volume')
ax.set_ylabel('Price')
plt.show()

# Number of rows per 'color' value, drawn as a bar chart.
color_counts = df['color'].value_counts()
color_counts.plot.bar()
plt.xlabel('Color')
plt.ylabel('Count')
plt.show()

# Number of rows per 'clarity' value, drawn with seaborn's count plot.
sns.countplot(x='clarity', data=df)
plt.show()

# Pairwise scatterplots of carat, depth, table and price, with the points
# coloured by their 'cut' value.
pair_cols = ['carat', 'depth', 'table', 'price']
sns.pairplot(df[pair_cols + ['cut']], hue='cut')
plt.show()

# One correlation heatmap per distinct 'cut' value: subset the frame to that
# cut, correlate the numerical columns, and show the matrix in its own figure.
corr_cols = ['carat', 'depth', 'table', 'price', 'x', 'y', 'z']
cuts = df['cut'].unique()
for cut in cuts:
    df_cut = df.loc[df['cut'] == cut]
    corr = df_cut.loc[:, corr_cols].corr()
    sns.heatmap(corr, cmap='coolwarm', annot=True)
    plt.title(f'Correlation matrix ({cut} cut)')
    plt.show()

# Violin plot: shape of the 'price' distribution within each 'cut' value.
sns.violinplot(x='cut', y='price', data=df)
plt.show()

# Swarm plot of 'price' by 'cut', with the points coloured by 'color'.
# A swarm plot positions every point individually so that none overlap, which
# scales poorly with the number of observations. Cap the number of plotted
# rows with a fixed-seed random sample so the cell finishes in reasonable
# time and gives the same figure on every run; small frames are plotted whole.
SWARM_MAX_POINTS = 2000
if len(df) > SWARM_MAX_POINTS:
    # sort_index() restores the original row order after sampling.
    swarm_data = df.sample(n=SWARM_MAX_POINTS, random_state=0).sort_index()
else:
    swarm_data = df
sns.swarmplot(data=swarm_data, x='cut', y='price', hue='color')
plt.show()

# Descriptive statistics of 'price' for every (cut, color) combination.
price_by_cut_color = df.groupby(['cut', 'color'])['price']
summary_stats = price_by_cut_color.describe()
print(summary_stats)


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from ydata_profiling import ProfileReport

# Load seaborn's 'tips' example dataset; this replaces whatever `df` held before.
dataset_name = 'tips'
df = sns.load_dataset(dataset_name)

# Build a ydata-profiling report for the frame and write it out as HTML.
# The output path is relative, so the file lands in the current working directory.
report_path = "profile.html"
profile = ProfileReport(df, title="Profiling Report")
profile.to_file(report_path)


Summarize dataset: 100%|██████████| 25/25 [00:00<00:00, 34.61it/s, Completed]                     
Generate report structure: 100%|██████████| 1/1 [00:00<00:00,  1.35it/s]
Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]