 TASK 1: Movie_Genre_Classification

In [None]:
# =========================================
# Task 1: Movie Genre Classification
# =========================================

import zipfile
import os
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# 1. Extract ZIP
zip_path = "/content/movies.csv.zip"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall("/content")

print("ZIP file extracted successfully")

# 2. Find train_data.txt recursively
train_file = None
for root, dirs, files in os.walk("/content"):
    for file in files:
        if file == "train_data.txt":
            train_file = os.path.join(root, file)
            break

if train_file is None:
    raise FileNotFoundError("train_data.txt not found")

print("Training file found at:", train_file)

# 3. Load training data (TXT format)
data = pd.read_csv(
    train_file,
    sep=" ::: ",
    engine="python",
    names=["id", "title", "genre", "plot"]
)

print("Dataset shape:", data.shape)
print(data.head())

# 4. Separate input and output
X = data["plot"]
y = data["genre"]

# 5. TF-IDF Vectorization
tfidf = TfidfVectorizer(
    stop_words="english",
    max_features=5000
)

X_tfidf = tfidf.fit_transform(X.astype(str))

# 6. Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y,
    test_size=0.2,
    random_state=42
)

# 7. Train model
model = MultinomialNB()
model.fit(X_train, y_train)

# 8. Accuracy
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("\nModel Accuracy:", accuracy)

# 9. Sample predictions
test_movies = [
    "A brave police officer fights criminals to save the city",
    "Two college students fall in love but face family pressure",
    "A family moves into a haunted house with supernatural events",
    "A detective investigates a mysterious murder case"
]

test_tfidf = tfidf.transform(test_movies)
predictions = model.predict(test_tfidf)

print("\nSample Predictions:")
for movie, genre in zip(test_movies, predictions):
    print("Plot:", movie)
    print("Predicted Genre:", genre)
    print("-" * 50)




ZIP file extracted successfully
Training file found at: /content/Genre Classification Dataset/train_data.txt
Dataset shape: (54214, 4)
   id                             title     genre  \
0   1      Oscar et la dame rose (2009)     drama   
1   2                      Cupid (1997)  thriller   
2   3  Young, Wild and Wonderful (1980)     adult   
3   4             The Secret Sin (1915)     drama   
4   5            The Unrecovered (2007)     drama   

                                                plot  
0  Listening in to a conversation between his doc...  
1  A brother and sister with a past incestuous re...  
2  As the bus empties the students for their fiel...  
3  To help their unemployed father make ends meet...  
4  The film's title refers not only to the un-rec...  

Model Accuracy: 0.5225491100249009

Sample Predictions:
Plot: A brave police officer fights criminals to save the city
Predicted Genre: drama
--------------------------------------------------
Plot: Two college stud

TASK:2 CUSTOMER_CHURN_PREDICTION


In [None]:
# ===============================
# CUSTOMER CHURN PREDICTION
# ===============================

import zipfile
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# ---------- 1. UNZIP DATASET ----------
zip_path = "/content/archive (7).zip"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall("/content/data")

# ---------- 2. LOAD CSV FILE ----------
for file in os.listdir("/content/data"):
    if file.endswith(".csv"):
        df = pd.read_csv("/content/data/" + file)
        break

# ---------- 3. DROP USELESS COLUMNS ----------
df = df.drop(['RowNumber', 'CustomerId', 'Surname'], axis=1)

# ---------- 4. SEPARATE FEATURES & TARGET ----------
X = df.drop('Exited', axis=1)
y = df['Exited']

# ---------- 5. ENCODE CATEGORICAL DATA ----------
le = LabelEncoder()
X['Geography'] = le.fit_transform(X['Geography'])
X['Gender'] = le.fit_transform(X['Gender'])

# ---------- 6. TRAIN TEST SPLIT ----------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ---------- 7. SCALE DATA (FOR LOGISTIC REGRESSION) ----------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ---------- 8. LOGISTIC REGRESSION ----------
lr = LogisticRegression(max_iter=5000)
lr.fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)

print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))

# ---------- 9. RANDOM FOREST ----------
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))


Logistic Regression Accuracy: 0.8155
Random Forest Accuracy: 0.8645


 TASK:3 SPAM_SMS_DETECTION




In [None]:
# ================================
# SPAM SMS DETECTION
# ================================

# 1. Import libraries
import pandas as pd
import zipfile

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# 2. Unzip dataset
zip_path = "/content/archive (8).zip"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall("/content")

print("ZIP file extracted")

# 3. Load dataset (common file name: spam.csv)
data = pd.read_csv("/content/spam.csv", encoding="latin-1")

data = data[['v1', 'v2']]
data.columns = ['label', 'message']

# 4. Convert labels to numbers
data['label'] = data['label'].map({'ham': 0, 'spam': 1})

# 5. Split data
X = data['message']
y = data['label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 6. TF-IDF vectorization
tfidf = TfidfVectorizer(stop_words='english')

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# 7. Train Naive Bayes model
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

# 8. Evaluate model
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)

print("Model Accuracy:", accuracy)

# 9. Prediction function
def check_sms(text):
    text_tfidf = tfidf.transform([text])
    prediction = model.predict(text_tfidf)

    if prediction[0] == 1:
        print("SPAM MESSAGE")
    else:
        print("HAM (NOT SPAM)")

# 10. Test messages
test_messages = [
    "Congratulations! You won a free prize",
    "I will call you later",
    "Urgent! Claim your reward now",
    "Are you coming to college today?",
    "Limited offer just for you",
    "Meeting is postponed to tomorrow"
]

for msg in test_messages:
    print("Message:", msg)
    check_sms(msg)
    print("-" * 40)


ZIP file extracted
Model Accuracy: 0.9668161434977578
Message: Congratulations! You won a free prize
SPAM MESSAGE
----------------------------------------
Message: I will call you later
HAM (NOT SPAM)
----------------------------------------
Message: Urgent! Claim your reward now
SPAM MESSAGE
----------------------------------------
Message: Are you coming to college today?
HAM (NOT SPAM)
----------------------------------------
Message: Limited offer just for you
HAM (NOT SPAM)
----------------------------------------
Message: Meeting is postponed to tomorrow
HAM (NOT SPAM)
----------------------------------------


 TASK:4 Handwritten_Text_Generatio

In [10]:
# Handwritten Text Generation using Character-Level RNN

import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# --------------------------------------------------
# Create handwritten-style text dataset
# --------------------------------------------------
text = """
hello my name is Ashwitha
this project demonstrates handwritten text generation
recurrent neural networks learn character patterns
lstm models are useful for sequence data
machine learning internship task
"""

with open("handwritten_text.txt", "w") as f:
    f.write(text)

with open("handwritten_text.txt", "r") as f:
    text = f.read().lower()

# --------------------------------------------------
#  Character encoding
# --------------------------------------------------
chars = sorted(list(set(text)))
char_to_idx = {c: i for i, c in enumerate(chars)}
idx_to_char = {i: c for i, c in enumerate(chars)}

# --------------------------------------------------
# Create training sequences
# --------------------------------------------------
seq_len = 20
X = []
y = []

for i in range(len(text) - seq_len):
    X.append([char_to_idx[c] for c in text[i:i + seq_len]])
    y.append(char_to_idx[text[i + seq_len]])

X = np.array(X) / len(chars)
y = tf.keras.utils.to_categorical(y, num_classes=len(chars))

# --------------------------------------------------
# Build RNN model
# --------------------------------------------------
model = Sequential()
model.add(LSTM(128, input_shape=(seq_len, 1)))
model.add(Dense(len(chars), activation="softmax"))

model.compile(loss="categorical_crossentropy", optimizer="adam")

# --------------------------------------------------
# Train the model
# --------------------------------------------------
model.fit(
    X.reshape(X.shape[0], seq_len, 1),
    y,
    epochs=20,
    batch_size=32
)

# --------------------------------------------------
# Generate handwritten-style text
# --------------------------------------------------
def generate_text(start_text, length=200):
    result = start_text.lower()

    while len(result) < seq_len:
        result = " " + result

    for _ in range(length):
        seq = [char_to_idx[c] for c in result[-seq_len:]]
        seq = np.array(seq) / len(chars)
        seq = seq.reshape(1, seq_len, 1)

        pred = model.predict(seq, verbose=0)
        result += idx_to_char[np.argmax(pred)]

    return result

print("\nGenerated Text:\n")
print(generate_text("this project "))


Epoch 1/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 25ms/step - loss: 3.2091
Epoch 2/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - loss: 3.1466
Epoch 3/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - loss: 3.0262
Epoch 4/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - loss: 2.9219
Epoch 5/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - loss: 2.9014
Epoch 6/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - loss: 2.8558
Epoch 7/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - loss: 2.8342
Epoch 8/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - loss: 2.8317
Epoch 9/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step - loss: 2.8652
Epoch 10/20
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 41ms/step - loss: 2.7981
Epoch 11/20
[1m6/6