# Import Required Libraries
Import the necessary libraries, including Pandas and NumPy.

In [6]:
# Import the necessary libraries, including Pandas and NumPy
import pandas as pd
import numpy as np

# Load Data into DataFrame
Load data from a CSV file into a Pandas DataFrame.

In [7]:
# Load the dataset

# Read the CSV file into a Pandas DataFrame
csv_path = 'data.csv'
df = pd.read_csv(csv_path)

# Preview the first five rows of the DataFrame
df.head(5)

Unnamed: 0,id,name,age,department,salary
0,1,John Doe,28,Engineering,70000
1,2,Jane Smith,34,Marketing,80000
2,3,Emily Davis,22,Sales,45000
3,4,Michael Brown,45,HR,60000
4,5,Jessica Wilson,30,Engineering,75000


# DataFrame Basic Operations
Perform basic operations on the DataFrame, such as selecting columns and filtering rows.

In [9]:
# DataFrame Basic Operations
# (pandas is already imported in the first code cell, so it is not re-imported here.)

# Select specific columns from the DataFrame; indexing with a list of labels returns a DataFrame
selected_columns = df[['name', 'department']]
print("\nSelected columns (name, department):")
print(selected_columns.head())

# Filter rows with a boolean mask: keep only the rows whose salary exceeds 60000
filtered_rows = df[df['salary'] > 60000]
print("\nFiltered rows where salary > 60000:")
print(filtered_rows.head())


Selected columns (name, department):
             name   department
0        John Doe  Engineering
1      Jane Smith    Marketing
2     Emily Davis        Sales
3   Michael Brown           HR
4  Jessica Wilson  Engineering

Filtered rows where salary > 60000:
   id             name  age   department  salary
0   1         John Doe   28  Engineering   70000
1   2       Jane Smith   34    Marketing   80000
4   5   Jessica Wilson   30  Engineering   75000
5   6  Daniel Martinez   40    Marketing   85000
7   8   James Anderson   50           HR   65000


# Data Cleaning
Clean the data by handling missing values, removing duplicates, and transforming data types.

In [None]:
# Data Cleaning

# Handle missing values by filling them with the mean of the column.
# numeric_only=True restricts the means to numeric columns: on pandas >= 2.0,
# df.mean() raises TypeError when text columns (here: name, department) are present.
# Missing values in non-numeric columns are left untouched.
df = df.fillna(df.mean(numeric_only=True))

# Remove duplicate rows (reassigned rather than inplace=True so the cell is safe to re-run)
df = df.drop_duplicates()

# Transform data types, for example, convert a column to datetime.
# Guarded so the cell does not raise KeyError when the loaded data has no
# 'date_column'; assign() builds a new frame and avoids SettingWithCopyWarning.
if 'date_column' in df.columns:
    df = df.assign(date_column=pd.to_datetime(df['date_column']))

# Display the cleaned DataFrame
df.head()

# Data Analysis
Analyze the data using descriptive statistics, groupby operations, and pivot tables.

In [None]:
# Data Analysis
# Uses the columns shown in the loaded data (age, department, salary) instead of
# the placeholder names column1/column2/column3, which raise KeyError.

# Descriptive statistics for the numeric columns.
# Printed explicitly: a notebook cell only auto-displays its last expression.
print("\nDescriptive statistics:")
print(df.describe())

# Groupby operations: mean age and salary per department.
# The numeric columns are selected explicitly because, on pandas >= 2.0,
# .mean() raises TypeError when the group contains text columns such as name.
grouped_df = df.groupby('department')[['age', 'salary']].mean()
print("\nMean age and salary by department:")
print(grouped_df.head())

# Pivot tables: mean salary per department.
# aggfunc is passed as the string 'mean' (passing np.mean is deprecated in recent pandas).
# Add columns='<another categorical column>' to spread the values across a second dimension.
pivot_table = df.pivot_table(values='salary', index='department', aggfunc='mean')
pivot_table.head()

# Data Visualization
Visualize the data using libraries such as Matplotlib and Seaborn.

In [None]:
# Data Visualization
# Uses the columns shown in the loaded data (age, department, salary) instead of
# the placeholder names column1/column2/column3, which raise KeyError.

# Import necessary libraries for visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Set the aesthetic style of the plots
sns.set_style("whitegrid")

# Plot a histogram of salaries (with a kernel density estimate overlaid)
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['salary'], bins=30, kde=True, ax=ax)
ax.set(title='Histogram of Salary', xlabel='Salary', ylabel='Frequency')
plt.show()

# Plot a scatter plot of age against salary
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x='age', y='salary', data=df, ax=ax)
ax.set(title='Scatter Plot of Age vs Salary', xlabel='Age', ylabel='Salary')
plt.show()

# Plot a box plot of salary for each department
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='department', y='salary', data=df, ax=ax)
ax.set(title='Box Plot of Salary by Department', xlabel='Department', ylabel='Salary')
plt.show()

# Plot a heatmap of the correlation matrix.
# Only numeric columns are correlated: on pandas >= 2.0, df.corr() raises
# ValueError when text columns such as name or department are present.
correlation_matrix = df.select_dtypes(include='number').corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Heatmap of Correlation Matrix')
plt.show()