# DATA1002 - Data Analysis Project

This notebook contains the main data analysis for the DATA1002 university project.

## 1. Setup and Imports

Import necessary libraries for data analysis and visualization.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Plot appearance: seaborn's white-grid theme and a wider default figure size.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})

# pandas display limits: no cap on displayed columns, up to 100 displayed rows.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
):
    pd.set_option(option_name, option_value)

print("Libraries imported successfully!")

## 2. Data Loading

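Load the dataset for analysis. Until a real CSV file is supplied, the cell below generates a reproducible synthetic sample dataset (fixed random seed) for demonstration.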

In [None]:
# To analyse a real dataset, replace the synthetic frame below with a CSV read:
# df = pd.read_csv('../data/dataset.csv')

# Synthetic demonstration data. The global seed is fixed and the columns are
# drawn in a fixed order (category, value1, value2, score) so that every run
# of this cell produces exactly the same values.
np.random.seed(42)
n_rows = 100
category_draws = np.random.choice(['A', 'B', 'C'], n_rows)
value1_draws = np.random.normal(50, 10, n_rows)
value2_draws = np.random.normal(100, 20, n_rows)
score_draws = np.random.uniform(0, 100, n_rows)

df = pd.DataFrame({
    'id': range(1, n_rows + 1),
    'category': category_draws,
    'value1': value1_draws,
    'value2': value2_draws,
    'score': score_draws,
})

row_count, column_count = df.shape
print(f"Dataset loaded: {row_count} rows, {column_count} columns")
# Last expression of the cell: the notebook renders the first rows as a table.
df.head()

## 3. Exploratory Data Analysis

### 3.1 Basic Statistics

In [None]:
# Summarise the frame's structure, then its descriptive statistics.
print("Dataset Info:")
# DataFrame.info() writes its report directly to stdout and returns None, so it
# is called on its own; wrapping it in print() would add a stray "None" line.
df.info()
print("\nBasic Statistics:")
# Last expression of the cell: the notebook renders the statistics table itself.
df.describe()

### 3.2 Missing Values Check

In [None]:
# Count missing entries per column, then report the overall total.
missing_values = df.isna().sum()
total_missing = missing_values.sum()

print("Missing Values:")
print(missing_values)
print(f"\nTotal missing values: {total_missing}")

## 4. Data Visualization

### 4.1 Distribution Plots

In [None]:
# Create distribution plots for numerical variables
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

axes[0].hist(df['value1'], bins=20, edgecolor='black')
axes[0].set_title('Distribution of Value1')
axes[0].set_xlabel('Value1')
axes[0].set_ylabel('Frequency')

axes[1].hist(df['value2'], bins=20, edgecolor='black')
axes[1].set_title('Distribution of Value2')
axes[1].set_xlabel('Value2')
axes[1].set_ylabel('Frequency')

axes[2].hist(df['score'], bins=20, edgecolor='black')
axes[2].set_title('Distribution of Score')
axes[2].set_xlabel('Score')
axes[2].set_ylabel('Frequency')

plt.tight_layout()
plt.show()

### 4.2 Box Plots by Category

In [None]:
# Box plots to compare categories
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

df.boxplot(column='value1', by='category', ax=axes[0])
axes[0].set_title('Value1 by Category')
axes[0].set_xlabel('Category')
axes[0].set_ylabel('Value1')

df.boxplot(column='score', by='category', ax=axes[1])
axes[1].set_title('Score by Category')
axes[1].set_xlabel('Category')
axes[1].set_ylabel('Score')

plt.tight_layout()
plt.show()

### 4.3 Scatter Plot and Correlation

In [None]:
# Scatter plot to show relationship between variables
plt.figure(figsize=(10, 6))
plt.scatter(df['value1'], df['value2'], c=df['score'], cmap='viridis', alpha=0.6)
plt.colorbar(label='Score')
plt.xlabel('Value1')
plt.ylabel('Value2')
plt.title('Relationship between Value1 and Value2')
plt.show()

# Correlation matrix
print("Correlation Matrix:")
correlation = df[['value1', 'value2', 'score']].corr()
print(correlation)

### 4.4 Correlation Heatmap

In [None]:
# Heatmap of correlations
plt.figure(figsize=(8, 6))
sns.heatmap(correlation, annot=True, cmap='coolwarm', center=0, 
            square=True, linewidths=1, cbar_kws={"shrink": 0.8})
plt.title('Correlation Heatmap')
plt.show()

## 5. Statistical Analysis

### 5.1 Group Statistics

In [None]:
# Calculate statistics by category
group_stats = df.groupby('category').agg({
    'value1': ['mean', 'std', 'min', 'max'],
    'value2': ['mean', 'std', 'min', 'max'],
    'score': ['mean', 'std', 'min', 'max']
})

print("Statistics by Category:")
print(group_stats)

### 5.2 Hypothesis Testing (ANOVA)

In [None]:
# Perform ANOVA to test if means differ across categories
categories = df['category'].unique()
groups = [df[df['category'] == cat]['value1'].values for cat in categories]

f_stat, p_value = stats.f_oneway(*groups)

print(f"ANOVA Results for Value1:")
print(f"F-statistic: {f_stat:.4f}")
print(f"P-value: {p_value:.4f}")

if p_value < 0.05:
    print("\nConclusion: There is a significant difference between categories (p < 0.05)")
else:
    print("\nConclusion: No significant difference between categories (p >= 0.05)")

## 6. Summary and Conclusions

### Key Findings:

1. **Data Overview**: The dataset contains [summary of data]
2. **Distributions**: [Describe the distributions observed]
3. **Relationships**: [Describe correlations and relationships found]
4. **Statistical Significance**: [Summarize hypothesis testing results]

### Next Steps:

- Further investigation of [specific findings]
- Additional analysis of [other aspects]
- Consider [potential improvements or extensions]

## 7. Save Results

In [None]:
# Optional export step, intentionally left disabled in this template.
# Uncomment the two lines below to write `df` to a CSV file (index column omitted).
# df.to_csv('../data/processed_data.csv', index=False)
# print("Results saved successfully!")