In [1]:
import pandas as pd
import re
import random
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag
import string

# Load the Data
train_data = pd.read_csv('train_data.csv')
test_data = pd.read_csv('test_data.csv')

# Separate the data into features and labels
X_train = train_data['text']
y_train = train_data['sentiment']
X_test = test_data['text']

# Initialize necessary tools
lemmatizer = WordNetLemmatizer()
stops = stopwords.words('english') + list(string.punctuation)

# Helper function to get simple POS tags
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

# Text cleaning function
def clean_text(words):
    output_words = []
    for w in words.split():
        if w.lower() not in stops:
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w, get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return ' '.join(output_words)

# Clean the datasets
X_train = X_train.apply(clean_text)
X_test = X_test.apply(clean_text)

# Initialize the CountVectorizer
vectorizer = CountVectorizer(max_features=2000)

# Create the RandomForestClassifier model pipeline
model = Pipeline([
    ('vectorizer', vectorizer),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
predictions = model.predict(X_test)

# Save the predictions to a CSV file
pd.DataFrame(predictions).to_csv('predictions.csv', index=False, header=False)


FileNotFoundError: [Errno 2] No such file or directory: 'train_data.csv'

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import nltk
from nltk.corpus import stopwords
import string
import time

# Load the Data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Separate the data into features and labels
X_train = train_data['text']
y_train = train_data['sentiment']

# Text preprocessing
nltk.download('stopwords')
stop_words = set(stopwords.words('english') + list(string.punctuation))

def preprocess_text(text):
    # Tokenization and removal of stopwords
    words = nltk.word_tokenize(text)
    words = [word.lower() for word in words if word.lower() not in stop_words]
    return ' '.join(words)

X_train = X_train.apply(preprocess_text)
X_test = test_data['text'].apply(preprocess_text)

# Vectorization using CountVectorizer
count_vec = CountVectorizer(max_features=2000)
X_train_features = count_vec.fit_transform(X_train)
X_test_features = count_vec.transform(X_test)

# Model training using RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X_train_features, y_train)

# Predictions
predictions = rfc.predict(X_test_features)

# Save predictions to a CSV file without headers
pd.Series(predictions).to_csv('predictions.csv', index=False, header=False)

KeyError: 'sentiment'

In [3]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import string

# Load the Data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Inspect the data to check column names
print("Train Data Columns: ", train_data.columns)
print("Test Data Columns: ", test_data.columns)

# Assuming the columns in train.csv are 'text' and 'sentiment'
# Separate the data into features and labels
X_train = train_data['text']
y_train = train_data['sentiment']


Train Data Columns:  Index(['tweet_id', 'airline_sentiment', 'airline', 'airline_sentiment_gold',
       'name', 'negativereason_gold', 'retweet_count', 'text', 'tweet_coord',
       'tweet_created', 'tweet_location', 'user_timezone'],
      dtype='object')
Test Data Columns:  Index(['tweet_id', 'airline', 'airline_sentiment_gold', 'name',
       'negativereason_gold', 'retweet_count', 'text', 'tweet_coord',
       'tweet_created', 'tweet_location', 'user_timezone'],
      dtype='object')


KeyError: 'sentiment'

In [4]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import string

# Load the Data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Inspect the data to check column names
print("Train Data Columns: ", train_data.columns)
print("Test Data Columns: ", test_data.columns)

# Using correct column names from the dataset
X_train = train_data['text']
y_train = train_data['airline_sentiment']

Train Data Columns:  Index(['tweet_id', 'airline_sentiment', 'airline', 'airline_sentiment_gold',
       'name', 'negativereason_gold', 'retweet_count', 'text', 'tweet_coord',
       'tweet_created', 'tweet_location', 'user_timezone'],
      dtype='object')
Test Data Columns:  Index(['tweet_id', 'airline', 'airline_sentiment_gold', 'name',
       'negativereason_gold', 'retweet_count', 'text', 'tweet_coord',
       'tweet_created', 'tweet_location', 'user_timezone'],
      dtype='object')


In [5]:
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(nltk.corpus.stopwords.words('english') + list(string.punctuation))

def preprocess_text(text):
    words = nltk.word_tokenize(text)
    words = [word.lower() for word in words if word.lower() not in stop_words]
    return ' '.join(words)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anjal\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\anjal\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [6]:
X_train = X_train.apply(preprocess_text)
X_test = test_data['text'].apply(preprocess_text)

# Vectorization using CountVectorizer
count_vec = CountVectorizer(max_features=2000)
X_train_features = count_vec.fit_transform(X_train)
X_test_features = count_vec.transform(X_test)

# Model training using RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X_train_features, y_train)

# Predictions
predictions = rfc.predict(X_test_features)

# Save predictions to a CSV file without headers
pd.Series(predictions).to_csv('predictions.csv', index=False, header=False)