# In [None]:
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

# English stop-word list from NLTK, held as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

df = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# URLs are collapsed to a single placeholder token. Because punctuation is
# stripped afterwards, the placeholder ends up in the text as 'urlweb'.
pattern_url = r'http[s]?://(?:[A-Za-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9A-Fa-f][0-9A-Fa-f]))+'
subs_url = r'url-web'

# Built once instead of once per row: deletes every ASCII punctuation character.
punctuation_table = str.maketrans('', '', string.punctuation)

tokeniser = TreebankWordTokenizer()
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()


def remove_stop_words(tokens):
    """Return the tokens that are not in the module-level ``stop_words`` set.

    Args:
        tokens: Iterable of token strings.

    Returns:
        A new list with the stop words dropped, order preserved.
    """
    return [t for t in tokens if t not in stop_words]


def preprocess_messages(messages):
    """Clean a Series of raw messages into whitespace-joined token strings.

    The same function is used for the training and the test messages so the
    vectoriser sees identically prepared text in both cases.

    Steps: replace URLs with a placeholder, lowercase, strip punctuation,
    tokenise, drop stop words, stem, lemmatise, then join the tokens back
    into one string per message (CountVectorizer expects strings, not lists).

    Args:
        messages: pandas Series of raw message text. Missing values are
            treated as empty messages.

    Returns:
        pandas Series of cleaned strings, aligned with the input index.
    """
    cleaned = messages.fillna('').astype(str)
    cleaned = cleaned.replace(to_replace=pattern_url, value=subs_url, regex=True)
    cleaned = cleaned.str.lower().apply(lambda text: text.translate(punctuation_table))

    tokens = cleaned.apply(tokeniser.tokenize)
    # Stop words are filtered BEFORE stemming: the stemmer rewrites many of
    # them (e.g. "this" -> "thi", "was" -> "wa"), after which they would no
    # longer match the stop-word list.
    tokens = tokens.apply(remove_stop_words)
    tokens = tokens.apply(
        lambda words: [lemmatizer.lemmatize(stemmer.stem(word)) for word in words]
    )
    return tokens.apply(' '.join)


# Clean the training messages
df['message'] = preprocess_messages(df['message'])

# Feature Selection Splitting out Variables
y = df['sentiment']
X = df['message']

# Bag-of-words counts over the 1000 most frequent terms.
bow_vect = CountVectorizer(max_features=1000, stop_words='english')
X_vect = bow_vect.fit_transform(X)

# with_mean=False keeps the count matrix sparse (no centring).
# NOTE(review): the vectoriser and scaler are fitted on all labelled rows
# before the split below, so validation rows influence both; fit them on the
# training part only if an unbiased validation score is needed.
trans = StandardScaler(with_mean=False)
X_vect = trans.fit_transform(X_vect)

# Train / validation split
X_train, X_val, y_train, y_val = train_test_split(X_vect, y, test_size=0.2, random_state=42)

# Model training (seeded so repeated runs give the same forest)
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)
rfc_pred = rfc.predict(X_val)

# Test Set: apply exactly the transformations fitted on the training data
testx = preprocess_messages(test['message'])
test_vect = trans.transform(bow_vect.transform(testx))

# Predictions
y_pred = rfc.predict(test_vect)

# Write the submission file
test['sentiment'] = y_pred
test[['tweetid', 'sentiment']].to_csv('testsubmission.csv', index=False)