In [None]:
# prompt: write a script to perform data analysis and visualization using breast cancer data set

from sklearn.datasets import load_breast_cancer
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Fetch the breast cancer dataset bundled with scikit-learn and wrap it in pandas objects
cancer = load_breast_cancer()
X = pd.DataFrame(data=cancer.data, columns=cancer.feature_names)
y = pd.Series(data=cancer.target, name='target')

# One frame holding every feature column plus the target as the last column
df = X.join(y)

# Basic Data Analysis: shape, dtypes / null counts, summary statistics, class balance
print("Dataset shape:", df.shape)
print("\nDataset info:")
df.info()
print("\nDescriptive statistics:", df.describe(), sep="\n")
print("\nTarget distribution:", df['target'].value_counts(), sep="\n")

# Feature-to-feature correlations only; the target column is left out on purpose
correlation_matrix = X.corr()

# Data Visualization

# 1. Per-class distributions for a handful of example features
features_to_plot = ['mean radius', 'mean texture', 'mean perimeter', 'mean area']
# Reshape to long format: one row per (sample, feature) pair, keyed by target
df_melted = pd.melt(
    df,
    id_vars=['target'],
    value_vars=features_to_plot,
    var_name='feature',
    value_name='value',
)

fig, ax = plt.subplots(figsize=(12, 8))
sns.violinplot(
    data=df_melted,
    x='feature',
    y='value',
    hue='target',
    split=True,      # the two target classes share one violin, one half each
    inner='quart',   # draw quartile lines inside each half
    palette='muted',
    ax=ax,
)
ax.set_title('Distribution of Selected Features by Target')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

# 2. Pairwise relationships for a small feature subset
#    (a pairplot over every feature would be slow to render)
selected_features_for_pairplot = ['mean radius', 'mean texture', 'mean perimeter', 'mean area', 'target']
pairplot_frame = df.loc[:, selected_features_for_pairplot]
sns.pairplot(
    data=pairplot_frame,
    hue='target',
    diag_kind='kde',
    palette='viridis',
)
# y > 1 lifts the figure-level title above the top row of panels
plt.suptitle('Pairplot of Selected Features', y=1.02)
plt.show()

# 3. Feature correlation heatmap
fig, ax = plt.subplots(figsize=(15, 12))
# Cell annotations stay off here; switch annot to True to print the correlation values
sns.heatmap(data=correlation_matrix, annot=False, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix of Features')
plt.show()

# 4. Boxplot of a few "worst" features, compared across the two diagnoses.
# NOTE: scikit-learn encodes this dataset as 0 = malignant, 1 = benign
# (the order of `cancer.target_names`), not the other way round.
features_for_boxplot = ['worst radius', 'worst texture', 'worst perimeter', 'worst area']

# Take the class names from the dataset itself rather than hard-coding them,
# so the legend shows readable labels instead of bare 0/1 codes.
diagnosis_order = [str(name) for name in cancer.target_names]
diagnosis_by_code = dict(enumerate(diagnosis_order))

plt.figure(figsize=(12, 8))
# Long format: one row per (sample, feature) pair, keyed by target
df_melted_boxplot = df.melt(id_vars=['target'], value_vars=features_for_boxplot, var_name='feature', value_name='value')
# The numeric 'target' column is kept; the label goes into a separate column
df_melted_boxplot['diagnosis'] = df_melted_boxplot['target'].map(diagnosis_by_code)
sns.boxplot(
    x='feature',
    y='value',
    hue='diagnosis',
    hue_order=diagnosis_order,  # fixed order keeps colours tied to the class codes 0, 1
    data=df_melted_boxplot,
    palette='pastel',
)
plt.title('Boxplot of Worst Features by Target')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# You can add more visualizations based on specific analysis goals,
# for example, scatter plots of highly correlated features, etc.

In [None]:
# prompt: write a script to perform data analysis and visualization using iris data set

from sklearn.datasets import load_iris
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Fetch the iris dataset bundled with scikit-learn and wrap it in pandas objects
iris = load_iris()
X = pd.DataFrame(data=iris.data, columns=iris.feature_names)
y = pd.Series(data=iris.target, name='target')

# One frame holding every feature column plus the target as the last column
df = X.join(y)

# Swap the integer class codes for species names so plot legends are readable
species_by_code = {0: 'setosa', 1: 'versicolor', 2: 'virginica'}
df['target'] = df['target'].map(species_by_code)

# Basic Data Analysis: shape, dtypes / null counts, summary statistics, class balance
print("Dataset shape:", df.shape)
print("\nDataset info:")
df.info()
print("\nDescriptive statistics:", df.describe(), sep="\n")
print("\nTarget distribution:", df['target'].value_counts(), sep="\n")

# Correlations are computed on the numeric features only (the target is categorical)
correlation_matrix = X.corr()

# Data Visualization

# 1. Distribution of Features (using all features)
features_to_plot = iris.feature_names
# Long format: one row per (sample, feature) pair, keyed by species
df_melted = df.melt(id_vars=['target'], value_vars=features_to_plot, var_name='feature', value_name='value')

plt.figure(figsize=(12, 8))
# `split=True` is intentionally not used here. A split violin compares exactly
# two hue levels, and this hue has three species: seaborn < 0.13 raises a
# ValueError, and newer releases draw half-violins that are not a real comparison.
# Each species gets its own full violin, side by side within each feature.
sns.violinplot(x='feature', y='value', hue='target', data=df_melted, inner='quart', palette='muted')
plt.title('Distribution of Features by Iris Species')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# 2. Pairwise feature relationships, coloured by species
sns.pairplot(
    data=df,
    hue='target',
    diag_kind='kde',
    palette='viridis',
)
# y > 1 lifts the figure-level title above the top row of panels
plt.suptitle('Pairplot of Iris Dataset Features', y=1.02)
plt.show()

# 3. Feature correlation heatmap, each cell annotated to two decimals
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(data=correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix of Iris Features')
plt.show()

# 4. Per-species boxplots for every feature
# Long format: one row per (sample, feature) pair, keyed by species
df_melted_boxplot = pd.melt(
    df,
    id_vars=['target'],
    value_vars=features_to_plot,
    var_name='feature',
    value_name='value',
)
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(
    data=df_melted_boxplot,
    x='feature',
    y='value',
    hue='target',
    palette='pastel',
    ax=ax,
)
ax.set_title('Boxplot of Features by Iris Species')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

# 5. One specific feature pair: sepal length against petal length, coloured by species
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=df,
    x='sepal length (cm)',
    y='petal length (cm)',
    hue='target',
    palette='viridis',
    s=100,  # marker size
    ax=ax,
)
ax.set_title('Sepal Length vs Petal Length by Species')
plt.show()

# You can explore other feature combinations as well.
