In [None]:
# Import necessary libraries
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from wordcloud import WordCloud


In [None]:
# Load the dataset
# The CSV location can be overridden with the REDDIT_NEWS_CSV environment
# variable so the notebook is runnable on machines other than the author's.
# When the variable is unset, the original hard-coded location is used as-is.
data_path = os.environ.get(
    "REDDIT_NEWS_CSV",
    r"C:\Users\subra\Desktop\Projects\Oasis\Stock_Market_Prediction\data\RedditNews.csv",
)
df = pd.read_csv(data_path)


In [None]:
# Display basic information about the dataset
print("First five rows of the dataset:")
print(df.head())

print("\nDataset information:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()


In [None]:
# Report how many missing entries each column contains
print("\nMissing values in the dataset:")
missing_per_column = df.isna().sum()  # isna() is the same check as isnull()
print(missing_per_column)

In [None]:
# Discard every row that has at least one missing value (no-op if there are none)
df = df.dropna(axis=0, how='any')


In [None]:
# Exploratory Data Analysis (EDA)
# Word cloud built from all headlines joined into one space-separated string
text = " ".join(df['News'])
wordcloud = WordCloud(background_color='white', width=800, height=400)
wordcloud.generate(text)

# Draw the cloud on a dedicated axes with the frame and ticks hidden
fig, ax = plt.subplots(figsize=(10, 6))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title("Word Cloud of News Headlines")
plt.show()


In [None]:
# Sentiment label distribution; the whole cell is skipped when the dataset
# has no 'Sentiment' column.
# NOTE(review): recent seaborn releases may warn when `palette` is given
# without `hue` -- confirm against the installed version.
if 'Sentiment' in df.columns:
    print("\nDistribution of Sentiment Labels:")
    sentiment_counts = df['Sentiment'].value_counts()
    print(sentiment_counts)

    sns.countplot(data=df, x='Sentiment', palette='viridis')
    plt.title("Distribution of Sentiment Labels")
    plt.show()


In [None]:
# Feature and target selection
# The headline text is the feature in both the supervised and unsupervised case.
X = df['News']

# The target is the 'Sentiment' column when present; otherwise y is None, which
# later cells use as the signal to skip supervised training.
y = df['Sentiment'] if 'Sentiment' in df.columns else None
if y is None:
    print("\nNo Sentiment column found. Assuming unsupervised learning or further analysis.")



In [None]:
# Text preprocessing and feature extraction
# Bag-of-words counts over the headlines: English stop words are removed and
# the vocabulary is capped at 5000 terms.
vectorizer = CountVectorizer(
    max_features=5000,
    stop_words='english',
)
X_vectorized = vectorizer.fit_transform(X)


In [None]:
# Hold out 30% of the vectorized headlines for evaluation.
# Only done in the supervised case (a Sentiment target exists); the fixed
# random_state keeps the split reproducible across runs.
if y is not None:
    X_train, X_test, y_train, y_test = train_test_split(
        X_vectorized,
        y,
        test_size=0.3,
        random_state=42,
    )


In [None]:
# Train the model using Logistic Regression
# Each notebook cell is compiled on its own, so an indented body needs its own
# guard here; training is only possible when a Sentiment target was found
# (X_train / y_train are created only in that case).
if y is not None:
    model = LogisticRegression()
    model.fit(X_train, y_train)

In [None]:
# Make predictions
# Guarded per cell: `model` and `X_test` exist only when a Sentiment target
# was available, and an indented body without a header does not compile.
if y is not None:
    y_pred = model.predict(X_test)

In [None]:
# Evaluate the model
# Guarded per cell: `y_test` and `y_pred` exist only in the supervised case,
# and an indented body without a header does not compile on its own.
if y is not None:
    print("\nAccuracy of the model:")
    print(accuracy_score(y_test, y_pred))

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))


In [None]:
# Confusion Matrix
# Guarded per cell: the inputs exist only in the supervised case, and an
# indented body without a header does not compile on its own.
if y is not None:
    # Pin the row/column order to model.classes_ so the matrix always lines up
    # with the tick labels below, even if some class never appears in the
    # test split or in the predictions.
    conf_matrix = confusion_matrix(y_test, y_pred, labels=model.classes_)
    sns.heatmap(
        conf_matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=model.classes_,
        yticklabels=model.classes_,
    )
    plt.title("Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()

In [None]:
# Unsupervised fallback (no Sentiment column): rank vocabulary terms by their
# total count across all headlines and show the 20 most frequent.
if y is None:
    print("\nPerforming Word Frequency Analysis:")
    # Column sums of the document-term matrix, flattened to one value per term
    term_totals = np.asarray(X_vectorized.sum(axis=0)).ravel()
    word_freq = (
        pd.DataFrame(vectorizer.get_feature_names_out(), columns=['Word'])
        .assign(Frequency=term_totals)
        .sort_values(by='Frequency', ascending=False)
    )
    print(word_freq.head(20))