# 1) Libraries import - Yeh sab tools hain jo hum analysis ke liye use karenge  

import pandas as pd
  # Data ko table ki form mein manage karne ke liye (Excel jaisa)

import matplotlib.pyplot as plt
  # Graphs banane ke liye

import seaborn as sns
  # Matlab ke stylish graphs banane ke liye (seaborn matplotlib ka upgraded version hai)



# 2) Titanic dataset load - Internet se direct data utha rahe hain

df = pd.read_csv("https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv")
# df = dataframe (ek type ka digital table jisme sab data store hoga)
# pd.read_csv() = pandas ka function jo CSV file ko load karta hai
# Yeh link ek public dataset ka hai jo Titanic ki information rakhta hai



# 3) Data ka overview - Dataset ki basic jankari lene ke liye

print(df.info())  
# df.info() = batata hai kitne columns hain, unke names, aur kitni values missing hain
# print() = console pe output dikhane ke liye

display(df.head())  
# df.head() = dataframe ke first 5 rows dikhata hai

## Task 1 Insights
- **Survival Rate**: ~38% passengers survived.
- **First Class**: Greatest survival rate.
- **Age**: Majority between 20–40 years.
- **Correlations**: Fare strongly linked to passenger class.


# — Task 2: Text Sentiment Analysis using NLTK movie_reviews corpus

import nltk
# Pehle required corpora download karo (sirf pehli baar run karna hai)
nltk.download('punkt')
nltk.download('movie_reviews')
nltk.download('stopwords')

from nltk.corpus import movie_reviews
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# 1) Data load & labels prepare
docs = [(movie_reviews.raw(fileid), category)
        for category in movie_reviews.categories()
        for fileid in movie_reviews.fileids(category)]
texts = [text for text, label in docs]
labels = [1 if label=='pos' else 0 for text, label in docs]

# 2) Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42)

# 3) TF-IDF vectorization
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_tf = tfidf.fit_transform(X_train)
X_test_tf  = tfidf.transform(X_test)

# 4) Model training
model = LogisticRegression(max_iter=200)
model.fit(X_train_tf, y_train)

# 5) Prediction & Evaluation
y_pred = model.predict(X_test_tf)
print("Classification Report:\n")
print(classification_report(y_test, y_pred, digits=3))


In [31]:
# ==============================================
# 🏠 HOUSE PRICE PREDICTION (BOSTON HOUSING)
# ==============================================

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score

# 🚨 Note: load_boston() is deprecated in newer sklearn versions
# We'll use the proper alternative dataset loading method
try:
    from sklearn.datasets import fetch_openml
    boston = fetch_openml(name='boston', version=1, as_frame=True)
    X_df = boston.data
    y = boston.target
except:
    # Fallback for older sklearn versions
    from sklearn.datasets import load_boston
    boston = load_boston()
    X_df = pd.DataFrame(boston.data, columns=boston.feature_names)
    y = boston.target

print("\n🔍 Dataset Preview:")
print(X_df.head())

# ==============================================
# 📊 1. LINEAR REGRESSION FROM SCRATCH
# ==============================================

print("\n🧮 Linear Regression from Scratch...")

try:
    # Add intercept term (column of 1s)
    X = np.hstack([np.ones((X_df.shape[0], 1)), X_df.values])

    # Normal equation: θ = (XᵀX)⁻¹Xᵀy
    theta = np.linalg.inv(X.T.dot(X)).dot(X.T).dot(y)

    # Predictions
    y_pred_lin = X.dot(theta)

    # Calculate metrics
    rmse_lin = np.sqrt(mean_squared_error(y, y_pred_lin))
    r2_lin = r2_score(y, y_pred_lin)
    print(f"✅ Linear Regression → RMSE: {rmse_lin:.3f}, R²: {r2_lin:.3f}")

except Exception as e:
    print(f"❌ Linear Regression Error: {e}")

# ==============================================
# 🌳 2. RANDOM FOREST (FOR COMPARISON)
# ==============================================

print("\n🌳 Training Random Forest Model...")

try:
    from sklearn.ensemble import RandomForestRegressor

    rf = RandomForestRegressor(
        n_estimators=100,  # 100 decision trees
        random_state=1,     # Reproducible results
        n_jobs=-1          # Use all CPU cores
    )
    rf.fit(X_df, y)
    y_pred_rf = rf.predict(X_df)

    rmse_rf = np.sqrt(mean_squared_error(y, y_pred_rf))
    r2_rf = r2_score(y, y_pred_rf)
    print(f"✅ Random Forest → RMSE: {rmse_rf:.3f}, R²: {r2_rf:.3f}")

except Exception as e:
    print(f"❌ Random Forest Error: {e}")

# ==============================================
# 🚀 3. XGBOOST (IF AVAILABLE)
# ==============================================

print("\n🚀 Attempting XGBoost...")

try:
    from xgboost import XGBRegressor

    xgb = XGBRegressor(
        n_estimators=100,
        random_state=1,
        eval_metric='rmse',
        n_jobs=-1
    )
    xgb.fit(X_df, y)
    y_pred_xgb = xgb.predict(X_df)

    rmse_xgb = np.sqrt(mean_squared_error(y, y_pred_xgb))
    r2_xgb = r2_score(y, y_pred_xgb)
    print(f"✅ XGBoost → RMSE: {rmse_xgb:.3f}, R²: {r2_xgb:.3f}")

except ImportError:
    print("ℹ️ XGBoost not installed. To install: pip install xgboost")
except Exception as e:
    print(f"❌ XGBoost Error: {e}")

# ==============================================
# 📊 FINAL COMPARISON
# ==============================================

print("\n🏆 Model Comparison:")
try:
    results = pd.DataFrame({
        'Model': ['Linear Regression', 'Random Forest', 'XGBoost'],
        'RMSE': [rmse_lin, rmse_rf, rmse_xgb if 'rmse_xgb' in locals() else np.nan],
        'R²': [r2_lin, r2_rf, r2_xgb if 'r2_xgb' in locals() else np.nan]
    })
    print(results.dropna())
except:
    print("Could not generate comparison table")

print("\n🎉 Analysis Complete!")


🔍 Dataset Preview:
      CRIM    ZN  INDUS CHAS    NOX     RM   AGE     DIS RAD    TAX  PTRATIO  \
0  0.00632  18.0   2.31    0  0.538  6.575  65.2  4.0900   1  296.0     15.3   
1  0.02731   0.0   7.07    0  0.469  6.421  78.9  4.9671   2  242.0     17.8   
2  0.02729   0.0   7.07    0  0.469  7.185  61.1  4.9671   2  242.0     17.8   
3  0.03237   0.0   2.18    0  0.458  6.998  45.8  6.0622   3  222.0     18.7   
4  0.06905   0.0   2.18    0  0.458  7.147  54.2  6.0622   3  222.0     18.7   

        B  LSTAT  
0  396.90   4.98  
1  396.90   9.14  
2  392.83   4.03  
3  394.63   2.94  
4  396.90   5.33  

🧮 Linear Regression from Scratch...
❌ Linear Regression Error: can't multiply sequence by non-int of type 'float'

🌳 Training Random Forest Model...
✅ Random Forest → RMSE: 1.199, R²: 0.983

🚀 Attempting XGBoost...
❌ XGBoost Error: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical