# Step 1: *Load the Data*

In [None]:
import pandas as pd

# Read the raw dataset from disk into a DataFrame
df = pd.read_csv(filepath_or_buffer='data.csv')

# Preview the top of the table (head() defaults to the first 5 rows)
print(df.head(n=5))

# Step 2: *Data Cleaning*

In [None]:
# Check for missing values (prints the number of nulls in each column)
print(df.isnull().sum())

# Drop rows with missing values (or you can fill them with a value)
df = df.dropna()

# Convert data types if necessary
# NOTE(review): 'column_name' looks like a placeholder -- replace it with a
# real column; astype('int') raises if any value cannot be parsed as an integer.
df['column_name'] = df['column_name'].astype('int')

# Remove duplicates
df = df.drop_duplicates()

# Display the cleaned dataframe.
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()

# Step 3: *Exploratory Data Analysis (EDA)*

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Descriptive statistics
print(df.describe())

# Histogram of a specific column (with a kernel density estimate overlaid)
plt.figure(figsize=(10, 6))
sns.histplot(df['column_name'], kde=True)
plt.title('Distribution of Column Name')
plt.show()

# Scatter plot to visualize relationship between two variables
plt.figure(figsize=(10, 6))
sns.scatterplot(x='column1', y='column2', data=df)
plt.title('Column1 vs Column2')
plt.show()

# Correlation matrix.
# Only numeric/boolean columns are correlated: since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns by default and raises
# ValueError if the frame contains e.g. string columns. Selecting the columns
# explicitly keeps this working on both old and new pandas versions.
plt.figure(figsize=(10, 6))
sns.heatmap(
    df.select_dtypes(include=['number', 'bool']).corr(),
    annot=True,
    cmap='coolwarm',
)
plt.title('Correlation Matrix')
plt.show()

# Step 4: *Basic Analysis*

In [None]:
# Estimator used for the regression step further down in this cell
from sklearn.linear_model import LinearRegression

# Mean of the value column within each category, returned as a flat table
grouped_df = (
    df.groupby('category_column')['value_column']
    .mean()
    .reset_index()
)
print(grouped_df)

# Regression inputs: two feature columns (X) and the target column (y)
X = df.loc[:, ['feature1', 'feature2']]
y = df.loc[:, 'target']

# Build the linear model and fit it in one step (fit() returns the estimator)
model = LinearRegression().fit(X, y)

# Report the fitted parameters
print(f'Intercept: {model.intercept_}')
print(f'Coefficients: {model.coef_}')

# Step 5: *Save the Results*

In [None]:
# Write the per-category aggregate to disk, omitting the index column
grouped_df.to_csv(path_or_buf='grouped_data.csv', index=False)

# Write the cleaned dataset to disk, omitting the index column
df.to_csv(path_or_buf='cleaned_data.csv', index=False)