In [1]:
print("Jupyter is working")

Jupyter is working


In [3]:
sample_posts = [
    "I love this!",
    "This is bad.",
    "Today is amazing!",
    "I hate everything.",
    "This project is fun."
]

for post in sample_posts:
    print(post)


I love this!
This is bad.
Today is amazing!
I hate everything.
This project is fun.


In [5]:
import re

def clean(text):
    text = text.lower()                      # lowercase
    text = re.sub(r"http\S+", "", text)      # remove links
    text = re.sub(r"[^a-z\s]", "", text)     # keep only letters/spaces
    return text.strip()

for post in sample_posts:
    print("original:", post)
    print("cleaned :", clean(post))
    print("---")



original: I love this!
cleaned : i love this
---
original: This is bad.
cleaned : this is bad
---
original: Today is amazing!
cleaned : today is amazing
---
original: I hate everything.
cleaned : i hate everything
---
original: This project is fun.
cleaned : this project is fun
---


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Clean all posts first
cleaned_posts = [clean(p) for p in sample_posts]

vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(cleaned_posts)

print("Shape:", vectors.shape)
print(vectors.toarray())


Shape: (5, 10)
[[0.         0.         0.         0.         0.         0.
  0.83088075 0.         0.55645052 0.        ]
 [0.         0.72604443 0.         0.         0.         0.48624042
  0.         0.         0.48624042 0.        ]
 [0.63907044 0.         0.         0.         0.         0.42799292
  0.         0.         0.         0.63907044]
 [0.         0.         0.70710678 0.         0.70710678 0.
  0.         0.         0.         0.        ]
 [0.         0.         0.         0.58752141 0.         0.39346994
  0.         0.58752141 0.39346994 0.        ]]


In [9]:
from sklearn.linear_model import LogisticRegression

# Fake labels for our tiny example
# 1 = positive, 0 = negative
labels = [1, 0, 1, 0, 1]  

model = LogisticRegression()
model.fit(vectors, labels)

print("Model trained!")


Model trained!


In [11]:
new_text = ["I am very happy today!"]

# clean it
cleaned_new = [clean(t) for t in new_text]

# convert to numbers
vec_new = vectorizer.transform(cleaned_new)

# predict
prediction = model.predict(vec_new)

print("Prediction:", prediction)



Prediction: [1]


In [13]:
def predict_sentiment(text):
    cleaned = clean(text)
    vec = vectorizer.transform([cleaned])
    pred = model.predict(vec)[0]
    return "positive" if pred == 1 else "negative"

print(predict_sentiment("I love this project!"))
print(predict_sentiment("This is terrible"))


positive
positive


In [15]:
import pandas as pd

# Change filename if your file is named differently and is inside social-project/data/
csv_path = "data/training.1600000.processed.noemoticon.csv"

# load only first 10000 rows to keep it fast
df = pd.read_csv(csv_path, encoding='latin-1', header=None, nrows=10000)

# Sentiment140 format: columns -> [target, id, date, flag, user, text]
df = df[[0,5]]
df.columns = ['label', 'text']

# Convert labels (Sentiment140 uses 0=negative, 4=positive) -> convert to 0/1
df['label'] = df['label'].apply(lambda x: 1 if x==4 else 0)

# quick inspect
print("Rows:", len(df))
print(df['label'].value_counts())
df.head(5)


FileNotFoundError: [Errno 2] No such file or directory: 'data/training.1600000.processed.noemoticon.csv'

In [17]:
import os

os.listdir()



['notes.txt',
 'anaconda_projects',
 'Untitled.ipynb',
 'test.py',
 '.ipynb_checkpoints']

In [19]:
import pandas as pd

# Change filename if your file is named differently and is inside social-project/data/
csv_path = "data/training.1600000.processed.noemoticon.csv"

# load only first 10000 rows to keep it fast
df = pd.read_csv(csv_path, encoding='latin-1', header=None, nrows=10000)

# Sentiment140 format: columns -> [target, id, date, flag, user, text]
df = df[[0,5]]
df.columns = ['label', 'text']

# Convert labels (Sentiment140 uses 0=negative, 4=positive) -> convert to 0/1
df['label'] = df['label'].apply(lambda x: 1 if x==4 else 0)

# quick inspect
print("Rows:", len(df))
print(df['label'].value_counts())
df.head(5)


FileNotFoundError: [Errno 2] No such file or directory: 'data/training.1600000.processed.noemoticon.csv'

In [22]:
import os

os.makedirs("data", exist_ok=True)
os.listdir()


['notes.txt',
 'anaconda_projects',
 'Untitled.ipynb',
 'test.py',
 '.ipynb_checkpoints',
 'data']

In [24]:
import pandas as pd

# Change filename if your file is named differently and is inside social-project/data/
csv_path = "data/training.1600000.processed.noemoticon.csv"

# load only first 10000 rows to keep it fast
df = pd.read_csv(csv_path, encoding='latin-1', header=None, nrows=10000)

# Sentiment140 format: columns -> [target, id, date, flag, user, text]
df = df[[0,5]]
df.columns = ['label', 'text']

# Convert labels (Sentiment140 uses 0=negative, 4=positive) -> convert to 0/1
df['label'] = df['label'].apply(lambda x: 1 if x==4 else 0)

# quick inspect
print("Rows:", len(df))
print(df['label'].value_counts())
df.head(5)


Rows: 10000
label
0    10000
Name: count, dtype: int64


Unnamed: 0,label,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [26]:
import pandas as pd

csv_path = "data/training.1600000.processed.noemoticon.csv"

# load first 100k rows (fast enough) and inspect raw label values
raw = pd.read_csv(csv_path, encoding='latin-1', header=None, usecols=[0], nrows=100000)
print("Loaded rows:", len(raw))
print("Raw label value counts:")
print(raw[0].value_counts())

# show the first 10 rows for a quick look
raw_full = pd.read_csv(csv_path, encoding='latin-1', header=None, nrows=10)
print("\nSample rows (first 10):")
print(raw_full)


Loaded rows: 100000
Raw label value counts:
0
0    100000
Name: count, dtype: int64

Sample rows (first 10):
   0           1                             2         3                4  \
0  0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY  _TheSpecialOne_   
1  0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY    scotthamilton   
2  0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY         mattycus   
3  0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY          ElleCTF   
4  0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY           Karoli   
5  0  1467811372  Mon Apr 06 22:20:00 PDT 2009  NO_QUERY         joy_wolf   
6  0  1467811592  Mon Apr 06 22:20:03 PDT 2009  NO_QUERY          mybirch   
7  0  1467811594  Mon Apr 06 22:20:03 PDT 2009  NO_QUERY             coZZ   
8  0  1467811795  Mon Apr 06 22:20:05 PDT 2009  NO_QUERY  2Hood4Hollywood   
9  0  1467812025  Mon Apr 06 22:20:09 PDT 2009  NO_QUERY          mimismo   

                                           

In [28]:
import pandas as pd

csv_path = "data/training.1600000.processed.noemoticon.csv"

# load rows 500,000 to 510,000 (a slice with mixed sentiment)
df_mixed = pd.read_csv(
    csv_path,
    encoding='latin-1',
    header=None,
    skiprows=500000,
    nrows=10000
)

df_mixed = df_mixed[[0,5]]
df_mixed.columns = ["label", "text"]

print(df_mixed['label'].value_counts())
df_mixed.head()


label
0    10000
Name: count, dtype: int64


Unnamed: 0,label,text
0,0,i cant sleep
1,0,@alba17 Sorry about kid situation. Good luck w...
2,0,nhá» nhÃ quÃ¡!!! cá»© má»i láº§n nghe bÃ i ...
3,0,Missing Him!! Twitter Me RED?? What the heck i...
4,0,#musicmonday i got the blues today ***sad ...


In [30]:
df_mixed['label'].value_counts()


label
0    10000
Name: count, dtype: int64

In [32]:
import pandas as pd

df = pd.read_csv("data/twitter_training.csv")
print(df.head())
print(df.columns)
print(df['label'].value_counts())


   2401  Borderlands  Positive  \
0  2401  Borderlands  Positive   
1  2401  Borderlands  Positive   
2  2401  Borderlands  Positive   
3  2401  Borderlands  Positive   
4  2401  Borderlands  Positive   

  im getting on borderlands and i will murder you all ,  
0  I am coming to the borders and I will kill you...     
1  im getting on borderlands and i will kill you ...     
2  im coming on borderlands and i will murder you...     
3  im getting on borderlands 2 and i will murder ...     
4  im getting into borderlands and i can murder y...     
Index(['2401', 'Borderlands', 'Positive',
       'im getting on borderlands and i will murder you all ,'],
      dtype='object')


KeyError: 'label'

In [34]:
import pandas as pd

df = pd.read_csv(
    "data/twitter_training.csv",
    header=None,   # IMPORTANT
    names=["id", "topic", "label", "text"]
)

print(df.head())
print(df['label'].value_counts())


     id        topic     label  \
0  2401  Borderlands  Positive   
1  2401  Borderlands  Positive   
2  2401  Borderlands  Positive   
3  2401  Borderlands  Positive   
4  2401  Borderlands  Positive   

                                                text  
0  im getting on borderlands and i will murder yo...  
1  I am coming to the borders and I will kill you...  
2  im getting on borderlands and i will kill you ...  
3  im coming on borderlands and i will murder you...  
4  im getting on borderlands 2 and i will murder ...  
label
Negative      22542
Positive      20832
Neutral       18318
Irrelevant    12990
Name: count, dtype: int64


In [36]:
df = df[df['label'] != "Irrelevant"]   # remove irrelevant class
df = df.reset_index(drop=True)

print(df['label'].value_counts())


label
Negative    22542
Positive    20832
Neutral     18318
Name: count, dtype: int64


In [38]:
import re

def clean(text):
    text = str(text).lower()
    text = re.sub(r"http\S+", "", text)       # remove links
    text = re.sub(r"@\w+", "", text)          # remove @mentions
    text = re.sub(r"[^a-z\s]", "", text)      # keep only letters/spaces
    text = re.sub(r"\s+", " ", text).strip()  # clean extra spaces
    return text

df['clean_text'] = df['text'].apply(clean)

df[['text','clean_text']].head()


Unnamed: 0,text,clean_text
0,im getting on borderlands and i will murder yo...,im getting on borderlands and i will murder yo...
1,I am coming to the borders and I will kill you...,i am coming to the borders and i will kill you...
2,im getting on borderlands and i will kill you ...,im getting on borderlands and i will kill you all
3,im coming on borderlands and i will murder you...,im coming on borderlands and i will murder you...
4,im getting on borderlands 2 and i will murder ...,im getting on borderlands and i will murder yo...


In [40]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Split into train/test
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_text'], 
    df['label'], 
    test_size=0.2, 
    random_state=42
)

# Create vectorizer (limit features so it's fast)
vectorizer = TfidfVectorizer(max_features=5000)

# Fit-transform on training data
X_train_vec = vectorizer.fit_transform(X_train)

# Transform test data
X_test_vec = vectorizer.transform(X_test)

print("Train shape:", X_train_vec.shape)
print("Test shape:", X_test_vec.shape)


Train shape: (49353, 5000)
Test shape: (12339, 5000)


In [42]:
# Train & evaluate a simple classifier
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Encode labels to integers
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)

# Train a logistic regression (fast, baseline)
model = LogisticRegression(max_iter=1000, class_weight='balanced')
model.fit(X_train_vec, y_train_enc)

# Predict on test set
y_pred = model.predict(X_test_vec)

# Eval
acc = accuracy_score(y_test_enc, y_pred)
print("Accuracy:", acc)
print("\nClassification report:")
print(classification_report(y_test_enc, y_pred, target_names=le.classes_))

print("\nConfusion matrix (rows=true, cols=pred):")
print(confusion_matrix(y_test_enc, y_pred))


Accuracy: 0.74981765134938

Classification report:
              precision    recall  f1-score   support

    Negative       0.79      0.77      0.78      4509
     Neutral       0.69      0.71      0.70      3650
    Positive       0.76      0.76      0.76      4180

    accuracy                           0.75     12339
   macro avg       0.75      0.75      0.75     12339
weighted avg       0.75      0.75      0.75     12339


Confusion matrix (rows=true, cols=pred):
[[3465  608  436]
 [ 475 2606  569]
 [ 426  573 3181]]


In [44]:
# Run this in your notebook (where `model` and `vectorizer` already exist)
import joblib
import os

os.makedirs("artifacts", exist_ok=True)
joblib.dump(model, "artifacts/sentiment_model.joblib")
joblib.dump(vectorizer, "artifacts/vectorizer.joblib")

print("Saved model and vectorizer to artifacts/")


Saved model and vectorizer to artifacts/
