<a href="https://colab.research.google.com/github/AnuragV2211/AnuragV2211/blob/main/My_Data_Analytics_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Topic-> Analyzing Public Sentiment Towards the New Education Policy: A Data-Driven Approach**

### **Program 1-> Sentiment analysis on new education policy text data using a RandomForestClassifier,TF-IDF vectorization and hyperparameter tuning for classification accuracy optimization.**

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib

# Load the dataset
dataset_path = "/content/drive/MyDrive/My Dataset.csv"
df = pd.read_csv(dataset_path)

# Split dataset into features (text) and target (sentiment)
X = df['text']
y = df['sentiment']

# Preprocess the text data using TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # Adjust max_features as needed
X_tfidf = tfidf_vectorizer.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Define the classifiers to be evaluated
classifiers = {
    'Random Forest': RandomForestClassifier()
}

# Perform grid search for hyperparameter tuning
best_model = None
best_accuracy = 0.0
for clf_name, clf in classifiers.items():
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20]
    }
    grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    # Evaluate the best model on the test set
    y_pred = grid_search.best_estimator_.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{clf_name} - Best Accuracy: {grid_search.best_score_}, Test Accuracy: {accuracy}")

    # Save the best model if it has higher accuracy
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = grid_search.best_estimator_

# Save the best model and vectorizer for future use
joblib.dump(best_model, 'best_model.pkl')
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

# Function to predict sentiment for new feedback
def predict_sentiment(feedback):
    # Load the trained model and vectorizer
    classifier = joblib.load('best_model.pkl')
    vectorizer = joblib.load('tfidf_vectorizer.pkl')

    # Preprocess the input feedback
    feedback_tfidf = vectorizer.transform([feedback])

    # Predict sentiment
    sentiment = classifier.predict(feedback_tfidf)

    return sentiment[0]

# Example usage
new_feedback = "Its emphasis on creativity and critical thinking empowers students to become lifelong learners and problem-solvers."
predicted_sentiment = predict_sentiment(new_feedback)
print("Predicted sentiment:", predicted_sentiment)

Random Forest - Best Accuracy: 0.7377777777777779, Test Accuracy: 0.6666666666666666
Predicted sentiment: neutral


### **Program 2-> Sentiment analysis on new education policy text data using a RandomForestClassifier is used in the pipeline, GradientBoostingClassifier, TF-IDF vectorization and incorporating data augmentation through synonym replacement and hyperparameter optimization.**

In [None]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from nltk.corpus import wordnet
import random

# Load dataset
data = pd.read_csv("/content/drive/MyDrive/My Dataset.csv")

# Data augmentation function using synonym replacement
def augment_data(text):
    words = text.split()
    augmented_texts = []
    for i, word in enumerate(words):
        synonyms = set()
        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                synonyms.add(lemma.name())
        if synonyms:
            synonym = random.choice(list(synonyms))
            augmented_texts.append(' '.join(words[:i] + [synonym] + words[i+1:]))
    return augmented_texts

# Augmenting the data
augmented_X = []
augmented_y = []
for text, label in zip(data['text'], data['sentiment']):
    augmented_X.append(text)
    augmented_y.append(label)
    augmented_texts = augment_data(text)
    for augmented_text in augmented_texts:
        augmented_X.append(augmented_text)
        augmented_y.append(label)

# Split augmented data into features (X) and target (y)
X = augmented_X
y = augmented_y

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=1000)),  # Adjust max_features as needed
    ('clf', RandomForestClassifier(n_estimators=200, max_depth=30, random_state=42)) # Optimized hyperparameters
])

# Train the model
pipeline.fit(X_train, y_train)

# Predictions
predictions = pipeline.predict(X_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

# Function to get sentiment analysis of user input
def get_sentiment(text):
    prediction = pipeline.predict([text])
    return prediction[0]

# Get feedback from user
new_feedback = "Its emphasis on creativity and critical thinking empowers students to become lifelong learners and problem-solvers."
# Perform sentiment analysis
sentiment = get_sentiment(new_feedback)

# Output sentiment
print("Sentiment analysis of your feedback:", sentiment)


Accuracy: 0.9841269841269841
Sentiment analysis of your feedback: neutral


### **Program 3-> Taking feedback as an input.**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from nltk.corpus import wordnet
import random

# Load dataset
data = pd.read_csv("/content/drive/MyDrive/My Dataset.csv")

# Data augmentation function using synonym replacement
def augment_data(text):
    words = text.split()
    augmented_texts = []
    for i, word in enumerate(words):
        synonyms = set()
        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                synonyms.add(lemma.name())
        if synonyms:
            synonym = random.choice(list(synonyms))
            augmented_texts.append(' '.join(words[:i] + [synonym] + words[i+1:]))
    return augmented_texts

# Augmenting the data
augmented_X = []
augmented_y = []
for text, label in zip(data['text'], data['sentiment']):
    augmented_X.append(text)
    augmented_y.append(label)
    augmented_texts = augment_data(text)
    for augmented_text in augmented_texts:
        augmented_X.append(augmented_text)
        augmented_y.append(label)

# Split augmented data into features (X) and target (y)
X = augmented_X
y = augmented_y

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=1000)),  # Adjust max_features as needed
    ('clf', RandomForestClassifier(n_estimators=200, max_depth=30, random_state=42)) # Optimized hyperparameters
])

# Train the model
pipeline.fit(X_train, y_train)

# Predictions
predictions = pipeline.predict(X_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

# Function to get sentiment analysis of user input
def get_sentiment(text):
    prediction = pipeline.predict([text])
    return prediction[0]

# Get feedback from user
new_feedback = input("Enter your feedback ->")
# Perform sentiment analysis
sentiment = get_sentiment(new_feedback)

# Output sentiment
print("Sentiment analysis of your feedback:", sentiment)


Accuracy: 0.9920634920634921
Enter your feedback -> The feasibility of implementing such a large-scale reform needs careful consideration.
Sentiment analysis of your feedback: negative


Hello
