In [1]:
# There have been some issues with duplicate tweets being pulled into the collection.
# Let's sort this out!

# Standard library: os gives access to environment variables
import os

# We're using the pymongo module to connect to the mLab database where the tweets are being stored
import pymongo

In [74]:
# Work out which connection string to use. Credentials are better kept out of the notebook,
# so set the BMLM_MONGODB_URI environment variable to connect with your own account.
# If it is not set we fall back to the original shared-user URI, so the notebook still
# runs exactly as it did before.
MONGODB_URI = os.environ.get(
    "BMLM_MONGODB_URI",
    "mongodb://shareduser:shareduser@ds123614.mlab.com:23614/bmlm-parking-tweets",
)

# Set up the "client" we want to connect to
client = pymongo.MongoClient(MONGODB_URI)

# Get the correct database (there's only one!)
db = client["bmlm-parking-tweets"]

# Now get the right collection
collection = db.tweets

In [75]:
# We need to use the "aggregation pipeline" to identify tweets with a non-unique "id" field
cursor = collection.aggregate([
    # First, we skip any document whose "id" field is missing or null. Such documents have
    # no tweet id to compare, so the grouping stage below would lump them together and
    # they would then look like duplicates of each other even though they are unrelated.
    {"$match": {
        "id": {"$ne": None}
      }
    },
    # Next, we create one "group document" for each unique tweet id in the database.
    # Each one has 3 fields:
    #   - "_id": Every document in a mongo collection needs a unique mongo _id. We set each
    #            group document's _id to be the tweet id we are considering.
    #   - "unique_ids": Every time we see a tweet with a tweet id equal to the group
    #                   document's _id, we add that tweet's unique mongo _id to this list.
    #   - "count": Every time we add a tweet _id to unique_ids, we add 1 to this total.
    {"$group": {
        "_id": {"id": "$id"},
        "unique_ids": {"$addToSet": "$_id"},
        "count": {"$sum": 1}
      }
    },
    # Finally, we "match" only the group documents with a count greater than 1; i.e. we keep
    # the groups with an _id which is the id of more than one tweet in the collection.
    {"$match": {
        "count": {"$gt": 1}
      }
    }
])

In [76]:
# Build the list of mongo _ids we want to delete.
# Walking through every group document from the aggregation, we skip the first _id in
# its "unique_ids" list (that copy of the tweet is the one we keep) and collect the rest.
to_delete = [
    mongo_id
    for group in cursor
    for mongo_id in group["unique_ids"][1:]
]

In [77]:
# Check how many tweets we are going to remove. Each entry in to_delete is the mongo _id
# of one surplus copy; the first _id of every duplicate group was left off the list.
len(to_delete)

28376

In [78]:
# Bin 'em: a single bulk delete of every document whose mongo _id is in to_delete
duplicates_filter = {"_id": {"$in": to_delete}}
collection.delete_many(duplicates_filter)

<pymongo.results.DeleteResult at 0x22179f8aa48>