<a href="https://colab.research.google.com/github/21pravi/23pravi/blob/main/GCN(First_draft).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_mean_pool
from torch_geometric.data import Data, DataLoader
from scipy.sparse import csr_matrix
from imblearn.combine import SMOTEENN  # Import SMOTEENN
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
import contractions
import unidecode

# Download the NLTK resources used by the preprocessing helpers below.
# NLTK >= 3.9 looks up 'punkt_tab' (word_tokenize) and
# 'averaged_perceptron_tagger_eng' (pos_tag) instead of the legacy pickled
# resources; both generations are requested so either NLTK release works.
for resource in ('stopwords', 'wordnet',
                 'averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng',
                 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Load your dataset
df = pd.read_csv('/content/data (1).csv')

# Shared preprocessing objects, built once and reused for every review.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
porter = PorterStemmer()

# Function to get wordnet pos tags
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag`` and only the first
    letter of the resulting tag is inspected. Tags that do not start with
    J, N, V or R fall back to ``wordnet.NOUN``.
    """
    _, tag = nltk.pos_tag([word])[0]
    initial = tag[0].upper()
    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unmapped tag both resolve to noun.
    return wordnet.NOUN

def clean_text(text):
    """Normalise a raw review into a space-separated string of stemmed tokens.

    Steps, in order: lowercase, expand contractions, strip HTML tags and
    URLs, transliterate to ASCII, drop digits / punctuation / any remaining
    non-letter characters, tokenize, remove English stop words, lemmatize
    (using a per-word POS tag) and finally Porter-stem each token.

    Args:
        text: The raw review. Anything that is not a ``str`` (for example a
            NaN float from pandas) is treated as an empty review.

    Returns:
        The cleaned tokens joined by single spaces; ``""`` for non-string
        input.
    """
    if not isinstance(text, str):
        return ""
    # Lowercase
    text = text.lower()
    # Expand contractions
    text = contractions.fix(text)
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove accents. This must run BEFORE the non-ASCII strip below,
    # otherwise accented letters are deleted outright ("café" -> "caf")
    # and the transliteration never sees them. Lowercase again because
    # transliteration can emit capital letters.
    text = unidecode.unidecode(text).lower()
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove anything that is still non-ASCII (e.g. emojis)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    # Remove special characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Collapse the whitespace left behind by the removals above
    text = re.sub(r'\s+', ' ', text).strip()
    # Tokenize text
    words = word_tokenize(text)
    # Remove stopwords
    words = [word for word in words if word not in stop_words]
    # Lemmatize words
    words = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in words]
    # Stem words
    words = [porter.stem(word) for word in words]
    return ' '.join(words)



# Label sentiment based on ratings.
# Rows without a rating are removed first: a NaN rating fails both
# comparisons below, so np.select() would silently label it 0.
df = df.dropna(subset=['Rating']).reset_index(drop=True)
conditions = [(df['Rating'] > 5), (df['Rating'] <= 5)]
values = [1, 0]
df['Sentiment'] = np.select(conditions, values)
df.drop(columns='Rating', inplace=True)
df = df.dropna().reset_index(drop=True)

# Clean text in 'Review' column
df['Review'] = df['Review'].apply(clean_text)

# TF-IDF features, capped at the 10,000 most frequent terms.
vectorizer = TfidfVectorizer(max_features=10000)
X = vectorizer.fit_transform(df['Review'])
y = df['Sentiment'].values

# Hold out the test set BEFORE resampling. Resampling first would put
# synthetic points interpolated from test reviews into the training set
# (and synthetic points into the test set), inflating the reported scores.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Apply SMOTEENN to the training portion only
smote_enn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)
X_train, y_train = X_resampled, y_resampled

def create_data_list(X, y):
    """Wrap every sample as a single-node, edge-less PyG ``Data`` graph.

    Args:
        X: Row-indexable 2-D feature matrix (the script passes a SciPy
            sparse TF-IDF matrix).
        y: Sequence of integer class labels, one per row of ``X``.

    Returns:
        A list with one ``Data`` object per label. Each holds ``x`` (the
        densified feature row as a float tensor), an empty ``edge_index`` of
        shape ``(2, 0)`` and ``y`` (a one-element long tensor).
    """
    data_list = []
    for i, label in enumerate(y):
        # Densify one row at a time so the full matrix is never held dense.
        features = csr_matrix(X[i]).toarray()
        x = torch.tensor(features, dtype=torch.float)
        # Create a dummy edge index (replace this with your actual graph connectivity)
        edge_index = torch.empty((2, 0), dtype=torch.long)
        # Wrap the label in a list to make it a single-element tensor
        y_val = torch.tensor([label], dtype=torch.long)
        data_list.append(Data(x=x, edge_index=edge_index, y=y_val))
    return data_list

# Training graphs: reshuffled on every pass over the loader.
train_data_list = create_data_list(X_train, y_train)
train_loader = DataLoader(
    train_data_list,
    batch_size=32,
    shuffle=True,
)

# Test graphs: iterated in a fixed order.
test_data_list = create_data_list(X_test, y_test)
test_loader = DataLoader(
    test_data_list,
    batch_size=32,
    shuffle=False,
)

class GCN(torch.nn.Module):
    """Three-layer graph convolutional network for graph-level classification.

    Two hidden ``GCNConv`` layers (ReLU + dropout) are followed by a third
    ``GCNConv`` that maps to ``output_dim``. Node outputs are mean-pooled per
    graph and returned as log-probabilities.

    Args:
        input_dim: Number of input features per node.
        hidden_dim: Width of the two hidden layers.
        output_dim: Number of classes.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.conv3 = GCNConv(hidden_dim, output_dim)
        self.dropout = torch.nn.Dropout(p=0.5)  # active only in train() mode

    def forward(self, data):
        """Return per-graph log-probabilities of shape (num_graphs, output_dim).

        Args:
            data: A PyG ``Data``/``Batch`` object providing ``x`` and
                ``edge_index`` and, when mini-batched, ``batch``.
        """
        x, edge_index = data.x, data.edge_index
        x = self.dropout(F.relu(self.conv1(x, edge_index)))
        x = self.dropout(F.relu(self.conv2(x, edge_index)))
        x = self.conv3(x, edge_index)
        # Pool each graph separately. An all-zeros batch vector would merge
        # every graph of a mini-batch into one output row, which no longer
        # matches the per-graph labels. Fall back to "one graph" only when
        # the input is a single un-batched graph.
        batch = getattr(data, 'batch', None)
        if batch is None:
            batch = torch.zeros(x.size(0), dtype=torch.long, device=x.device)
        x = global_mean_pool(x, batch=batch)
        return F.log_softmax(x, dim=1)

input_dim = X_train.shape[1]  # one input feature per TF-IDF column
hidden_dim = 32  # width of the two hidden GCN layers
output_dim = 2  # two sentiment classes: 0 (rating <= 5) and 1 (rating > 5)

model = GCN(input_dim, hidden_dim, output_dim)
# Adam with a small learning rate and L2 weight decay.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)
# GCN.forward already returns log-probabilities. CrossEntropyLoss applies
# log_softmax once more, which leaves log-probabilities unchanged, so the
# resulting loss is the usual negative log-likelihood.
criterion = torch.nn.CrossEntropyLoss()

def train_model(model, train_loader, optimizer, criterion, epochs=100):
    """Train ``model`` in place and print the mean batch loss of every epoch.

    Args:
        model: Callable module taking one batch and returning class scores
            of shape (batch_size, num_classes).
        train_loader: Iterable of batches; each batch exposes labels as
            ``batch.y``.
        optimizer: Optimizer built over ``model``'s parameters.
        criterion: Loss called as ``criterion(scores, labels)``.
        epochs: Number of passes over ``train_loader``.
    """
    model.train()
    for epoch in range(epochs):
        batch_losses = []
        for data in train_loader:
            optimizer.zero_grad()
            out = model(data)
            # reshape(-1) always yields a 1-D target. squeeze() would turn
            # the labels of a one-graph batch into a 0-D tensor, which the
            # loss rejects against a (1, num_classes) output.
            loss = criterion(out, data.y.reshape(-1))
            loss.backward()
            optimizer.step()
            batch_losses.append(loss.item())
        # Report the epoch average rather than whichever batch came last;
        # an empty loader reports NaN instead of hitting an unbound `loss`.
        if batch_losses:
            epoch_loss = sum(batch_losses) / len(batch_losses)
        else:
            epoch_loss = float('nan')
        print(f'Epoch {epoch+1}, Loss: {epoch_loss}')

# Fit the network using the default number of epochs.
train_model(
    model=model,
    train_loader=train_loader,
    optimizer=optimizer,
    criterion=criterion,
)

def evaluate_model(model, test_loader):
    """Score ``model`` on every batch of ``test_loader``.

    The model is switched to eval mode and run without gradient tracking.
    The predicted class is the arg-max over dimension 1 of the output.

    Returns:
        A ``(accuracy, report)`` tuple produced by ``accuracy_score`` and
        ``classification_report`` over all batches.
    """
    model.eval()
    pred_chunks, label_chunks = [], []
    with torch.no_grad():
        for batch in test_loader:
            scores = model(batch)
            pred_chunks.append(scores.argmax(dim=1).cpu().numpy())
            label_chunks.append(batch.y.cpu().numpy())
    # Stitch the per-batch arrays together once and reuse them for both metrics.
    y_true = np.hstack(label_chunks)
    y_pred = np.hstack(pred_chunks)
    return accuracy_score(y_true, y_pred), classification_report(y_true, y_pred)

# Evaluate on the held-out loader and print accuracy followed by the report.
accuracy, report = evaluate_model(model, test_loader)
print(f'Accuracy: {accuracy}', report, sep='\n')

### Import Libraries




In [None]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_mean_pool
from torch_geometric.data import Data, DataLoader
from scipy.sparse import csr_matrix
from imblearn.combine import SMOTEENN  # Import SMOTEENN
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
import contractions
import unidecode

### Preprocessing

In [None]:
# Download the NLTK resources used by the preprocessing helpers below.
# NLTK >= 3.9 looks up 'punkt_tab' (word_tokenize) and
# 'averaged_perceptron_tagger_eng' (pos_tag) instead of the legacy pickled
# resources; both generations are requested so either NLTK release works.
for resource in ('stopwords', 'wordnet',
                 'averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng',
                 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Load your dataset
df = pd.read_csv('/content/data (1).csv')

# Shared preprocessing objects, built once and reused for every review.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
porter = PorterStemmer()

# Function to get wordnet pos tags
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag`` and only the first
    letter of the resulting tag is inspected. Tags that do not start with
    J, N, V or R fall back to ``wordnet.NOUN``.
    """
    _, tag = nltk.pos_tag([word])[0]
    initial = tag[0].upper()
    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unmapped tag both resolve to noun.
    return wordnet.NOUN

def clean_text(text):
    """Normalise a raw review into a space-separated string of stemmed tokens.

    Steps, in order: lowercase, expand contractions, strip HTML tags and
    URLs, transliterate to ASCII, drop digits / punctuation / any remaining
    non-letter characters, tokenize, remove English stop words, lemmatize
    (using a per-word POS tag) and finally Porter-stem each token.

    Args:
        text: The raw review. Anything that is not a ``str`` (for example a
            NaN float from pandas) is treated as an empty review.

    Returns:
        The cleaned tokens joined by single spaces; ``""`` for non-string
        input.
    """
    if not isinstance(text, str):
        return ""
    # Lowercase
    text = text.lower()
    # Expand contractions
    text = contractions.fix(text)
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove accents. This must run BEFORE the non-ASCII strip below,
    # otherwise accented letters are deleted outright ("café" -> "caf")
    # and the transliteration never sees them. Lowercase again because
    # transliteration can emit capital letters.
    text = unidecode.unidecode(text).lower()
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove anything that is still non-ASCII (e.g. emojis)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    # Remove special characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Collapse the whitespace left behind by the removals above
    text = re.sub(r'\s+', ' ', text).strip()
    # Tokenize text
    words = word_tokenize(text)
    # Remove stopwords
    words = [word for word in words if word not in stop_words]
    # Lemmatize words
    words = [lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in words]
    # Stem words
    words = [porter.stem(word) for word in words]
    return ' '.join(words)



# Label sentiment based on ratings.
# Rows without a rating are removed first: a NaN rating fails both
# comparisons below, so np.select() would silently label it 0.
df = df.dropna(subset=['Rating']).reset_index(drop=True)
conditions = [(df['Rating'] > 5), (df['Rating'] <= 5)]
values = [1, 0]
df['Sentiment'] = np.select(conditions, values)
df.drop(columns='Rating', inplace=True)
df = df.dropna().reset_index(drop=True)

# Clean text in 'Review' column
df['Review'] = df['Review'].apply(clean_text)

# TF-IDF features, capped at the 10,000 most frequent terms.
vectorizer = TfidfVectorizer(max_features=10000)
X = vectorizer.fit_transform(df['Review'])
y = df['Sentiment'].values

In [None]:
# Hold out the test set from the TF-IDF matrix first, then rebalance only
# the training portion with SMOTEENN. Resampling before the split would leak
# synthetic points derived from test reviews into training and put synthetic
# points into the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)
X_resampled, y_resampled = SMOTEENN(random_state=42).fit_resample(X_train, y_train)
X_train, y_train = X_resampled, y_resampled

NameError: name 'train_test_split' is not defined

### Define GCN model

In [None]:
def create_data_list(X, y):
    """Wrap every sample as a single-node, edge-less PyG ``Data`` graph.

    Args:
        X: Row-indexable 2-D feature matrix (the script passes a SciPy
            sparse TF-IDF matrix).
        y: Sequence of integer class labels, one per row of ``X``.

    Returns:
        A list with one ``Data`` object per label. Each holds ``x`` (the
        densified feature row as a float tensor), an empty ``edge_index`` of
        shape ``(2, 0)`` and ``y`` (a one-element long tensor).
    """
    data_list = []
    for i, label in enumerate(y):
        # Densify one row at a time so the full matrix is never held dense.
        features = csr_matrix(X[i]).toarray()
        x = torch.tensor(features, dtype=torch.float)
        # Create a dummy edge index (replace this with your actual graph connectivity)
        edge_index = torch.empty((2, 0), dtype=torch.long)
        # Wrap the label in a list to make it a single-element tensor
        y_val = torch.tensor([label], dtype=torch.long)
        data_list.append(Data(x=x, edge_index=edge_index, y=y_val))
    return data_list

# Training graphs: reshuffled on every pass over the loader.
train_data_list = create_data_list(X_train, y_train)
train_loader = DataLoader(
    train_data_list,
    batch_size=32,
    shuffle=True,
)

# Test graphs: iterated in a fixed order.
test_data_list = create_data_list(X_test, y_test)
test_loader = DataLoader(
    test_data_list,
    batch_size=32,
    shuffle=False,
)

class GCN(torch.nn.Module):
    """Three-layer graph convolutional network for graph-level classification.

    Two hidden ``GCNConv`` layers (ReLU + dropout) are followed by a third
    ``GCNConv`` that maps to ``output_dim``. Node outputs are mean-pooled per
    graph and returned as log-probabilities.

    Args:
        input_dim: Number of input features per node.
        hidden_dim: Width of the two hidden layers.
        output_dim: Number of classes.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.conv3 = GCNConv(hidden_dim, output_dim)
        self.dropout = torch.nn.Dropout(p=0.5)  # active only in train() mode

    def forward(self, data):
        """Return per-graph log-probabilities of shape (num_graphs, output_dim).

        Args:
            data: A PyG ``Data``/``Batch`` object providing ``x`` and
                ``edge_index`` and, when mini-batched, ``batch``.
        """
        x, edge_index = data.x, data.edge_index
        x = self.dropout(F.relu(self.conv1(x, edge_index)))
        x = self.dropout(F.relu(self.conv2(x, edge_index)))
        x = self.conv3(x, edge_index)
        # Pool each graph separately. An all-zeros batch vector would merge
        # every graph of a mini-batch into one output row, which no longer
        # matches the per-graph labels. Fall back to "one graph" only when
        # the input is a single un-batched graph.
        batch = getattr(data, 'batch', None)
        if batch is None:
            batch = torch.zeros(x.size(0), dtype=torch.long, device=x.device)
        x = global_mean_pool(x, batch=batch)
        return F.log_softmax(x, dim=1)

input_dim = X_train.shape[1]  # one input feature per TF-IDF column
hidden_dim = 32  # width of the two hidden GCN layers
output_dim = 2  # two sentiment classes: 0 (rating <= 5) and 1 (rating > 5)

model = GCN(input_dim, hidden_dim, output_dim)
# Adam with a small learning rate and L2 weight decay.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)
# GCN.forward already returns log-probabilities. CrossEntropyLoss applies
# log_softmax once more, which leaves log-probabilities unchanged, so the
# resulting loss is the usual negative log-likelihood.
criterion = torch.nn.CrossEntropyLoss()

def train_model(model, train_loader, optimizer, criterion, epochs=50):
    """Train ``model`` in place and print the mean batch loss of every epoch.

    Args:
        model: Callable module taking one batch and returning class scores
            of shape (batch_size, num_classes).
        train_loader: Iterable of batches; each batch exposes labels as
            ``batch.y``.
        optimizer: Optimizer built over ``model``'s parameters.
        criterion: Loss called as ``criterion(scores, labels)``.
        epochs: Number of passes over ``train_loader``.
    """
    model.train()
    for epoch in range(epochs):
        batch_losses = []
        for data in train_loader:
            optimizer.zero_grad()
            out = model(data)
            # reshape(-1) always yields a 1-D target. squeeze() would turn
            # the labels of a one-graph batch into a 0-D tensor, which the
            # loss rejects against a (1, num_classes) output.
            loss = criterion(out, data.y.reshape(-1))
            loss.backward()
            optimizer.step()
            batch_losses.append(loss.item())
        # Report the epoch average rather than whichever batch came last;
        # an empty loader reports NaN instead of hitting an unbound `loss`.
        if batch_losses:
            epoch_loss = sum(batch_losses) / len(batch_losses)
        else:
            epoch_loss = float('nan')
        print(f'Epoch {epoch+1}, Loss: {epoch_loss}')

# Fit the network using the default number of epochs.
train_model(
    model=model,
    train_loader=train_loader,
    optimizer=optimizer,
    criterion=criterion,
)

def evaluate_model(model, test_loader):
    """Score ``model`` on every batch of ``test_loader``.

    The model is switched to eval mode and run without gradient tracking.
    The predicted class is the arg-max over dimension 1 of the output.

    Returns:
        A ``(accuracy, report)`` tuple produced by ``accuracy_score`` and
        ``classification_report`` over all batches.
    """
    model.eval()
    pred_chunks, label_chunks = [], []
    with torch.no_grad():
        for batch in test_loader:
            scores = model(batch)
            pred_chunks.append(scores.argmax(dim=1).cpu().numpy())
            label_chunks.append(batch.y.cpu().numpy())
    # Stitch the per-batch arrays together once and reuse them for both metrics.
    y_true = np.hstack(label_chunks)
    y_pred = np.hstack(pred_chunks)
    return accuracy_score(y_true, y_pred), classification_report(y_true, y_pred)

# Evaluate on the held-out loader and print accuracy followed by the report.
accuracy, report = evaluate_model(model, test_loader)
print(f'Accuracy: {accuracy}', report, sep='\n')